// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <[email protected]>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <[email protected]>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->block_dirty_folio)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock		(filemap_fault)
 *      ->lock_page		(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->lruvec->lru_lock	(follow_page_mask->mark_page_accessed)
 *    ->lruvec->lru_lock	(check_pte_range->folio_isolate_lru)
 *    ->private_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock		(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock		(folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->block_dirty_folio)
 */

static void page_cache_delete(struct address_space *mapping,
				   struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	xas_set_order(&xas, folio->index, folio_order(folio));
	nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave folio->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = folio_mapcount(folio);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
				 */
				atomic_set(&folio->_mapcount, -1);
				folio_ref_sub(folio, mapcount);
			}
		}
	}

	/* hugetlb folios do not participate in page cache accounting. */
	if (folio_test_hugetlb(folio))
		return;

	nr = folio_nr_pages(folio);

	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	if (folio_test_swapbacked(folio)) {
		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		if (folio_test_pmd_mappable(folio))
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
	} else if (folio_test_pmd_mappable(folio)) {
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}
	if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
		mod_node_page_state(folio_pgdat(folio),
				    NR_KERNEL_FILE_PAGES, -nr);

	/*
	 * At this point folio must be either written or cleaned by
	 * truncate.  Dirty folio here signals a bug and loss of
	 * unwritten data - on ordinary filesystems.
	 *
	 * But it's harmless on in-memory filesystems like tmpfs; and can
	 * occur when a driver which did get_user_pages() sets page dirty
	 * before putting it, while the inode is being finally evicted.
	 *
	 * Below fixes dirty accounting after removing the folio entirely
	 * but leaves the dirty flag set: it has no effect for truncated
	 * folio and anyway will be cleared before returning folio to
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
			 mapping_can_writeback(mapping)))
		folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	struct address_space *mapping = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);
	filemap_unaccount_folio(mapping, folio);
	page_cache_delete(mapping, folio, shadow);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*free_folio)(struct folio *);

	free_folio = mapping->a_ops->free_folio;
	if (free_folio)
		free_folio(folio);

	folio_put_refs(folio, folio_nr_pages(folio));
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping.  The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
	long total_pages = 0;
	int i = 0;
	struct folio *folio;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (i >= folio_batch_count(fbatch))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(folio))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (folio != fbatch->folios[i]) {
			VM_BUG_ON_FOLIO(folio->index >
					fbatch->folios[i]->index, folio);
			continue;
		}

		WARN_ON_ONCE(!folio_test_locked(folio));

		folio->mapping = NULL;
		/* Leave folio->index set: truncation lookup relies on it */

		i++;
		xas_store(&xas, NULL);
		total_pages += folio_nr_pages(folio);
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	int i;

	if (!folio_batch_count(fbatch))
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		trace_mm_filemap_delete_from_page_cache(folio);
		filemap_unaccount_folio(mapping, folio);
	}
	page_cache_delete_batch(mapping, fbatch);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (i = 0; i < folio_batch_count(fbatch); i++)
		filemap_free_folio(mapping, fbatch->folios[i]);
}

int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @wbc:	the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	ret = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);
	return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);
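
/*
 * Illustrative sketch (not taken from a particular caller): a filesystem
 * that wants to drive writeback with its own writeback_control would do
 * something along the lines of
 *
 *	struct writeback_control wbc = {
 *		.sync_mode	= WB_SYNC_ALL,
 *		.nr_to_write	= LONG_MAX,
 *		.range_start	= 0,
 *		.range_end	= LLONG_MAX,
 *	};
 *	err = filemap_fdatawrite_wbc(mapping, &wbc);
 *
 * which is exactly what __filemap_fdatawrite_range() below packages up
 * for byte ranges.
 */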

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
		loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return filemap_fdatawrite_wbc(mapping, &wbc);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
			     loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);
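
/*
 * Sketch of how the write and wait halves are typically paired (the
 * combined helpers later in this file, e.g. filemap_write_and_wait_range(),
 * are the usual way to get this right):
 *
 *	err = filemap_fdatawrite_range(mapping, start, end);
 *	if (!err)
 *		err = filemap_fdatawait_range(mapping, start, end);
 *
 * Waiting is what turns "writeback has been started" into "data is on
 * stable storage, or an error has been recorded against the mapping".
 */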

/**
 * filemap_fdatawrite_range_kick - start writeback on a range
 * @mapping:	target address_space
 * @start:	index to start writeback on
 * @end:	last (inclusive) index for writeback
 *
 * This is a non-integrity writeback helper, to start writing back folios
 * for the indicated range.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
				  loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
}
EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:	address space within which to check
 * @start_byte:	offset in bytes where the range starts
 * @end_byte:	offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct folio *folio;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		folio = xas_find(&xas, max);
		if (xas_retry(&xas, folio))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(folio))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned nr_folios;

	folio_batch_init(&fbatch);

	while (index <= end) {
		unsigned i;

		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				PAGECACHE_TAG_WRITEBACK, &fbatch);

		if (!nr_folios)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_wait_writeback(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
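
/*
 * Sketch of which waiting flavour goes with which kind of caller (the
 * callers named here are the examples from the kernel-doc above, not an
 * exhaustive list):
 *
 *	fsync()-style, per-file callers:
 *		err = filemap_fdatawait_range(mapping, start, end);
 *		(consumes AS_EIO/AS_ENOSPC, so the error is reported once)
 *
 *	sync(2) / fsfreeze(8) style, system-wide flushers:
 *		err = filemap_fdatawait_keep_errors(mapping);
 *		(leaves the error flags set for a later, per-file fsync())
 */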

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 *				   reported and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion.  The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);
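
/*
 * Putting the error-tracking pieces together, a simplified fsync-like
 * sequence looks roughly like this (a sketch only; real ->fsync() methods
 * also write metadata and take filesystem locks):
 *
 *	err = file_write_and_wait_range(file, start, end);
 *
 * The single return value already folds in both a failure of the writeback
 * just issued and any error recorded in mapping->wb_err since this struct
 * file last sampled it, because file_write_and_wait_range() finishes with
 * file_check_and_advance_wb_err().
 */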

/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_replace_folio(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		__lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		__lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);
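
/*
 * For reference, the usual way a folio gets *added* to the page cache
 * (a sketch; __filemap_get_folio() with FGP_CREAT later in this file wraps
 * this sequence up):
 *
 *	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), order);
 *	if (!folio)
 *		return -ENOMEM;
 *	ret = filemap_add_folio(mapping, folio, index, mapping_gfp_mask(mapping));
 *	if (ret) {
 *		folio_put(folio);
 *		return ret;
 *	}
 *	(on success the folio is locked, in the xarray and on the LRU)
 */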

noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	bool huge;
	long nr;
	unsigned int forder = folio_order(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
			folio);
	mapping_set_update(&xas, mapping);

	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
	huge = folio_test_hugetlb(folio);
	nr = folio_nr_pages(folio);

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	for (;;) {
		int order = -1;
		void *entry, *old = NULL;

		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
			/*
			 * If a larger entry exists,
			 * it will be the first and only entry iterated.
			 */
			if (order == -1)
				order = xas_get_order(&xas);
		}

		if (old) {
			if (order > 0 && order > forder) {
				unsigned int split_order = max(forder,
						xas_try_split_min_order(order));

				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));

				while (order > forder) {
					xas_set_order(&xas, index, split_order);
					xas_try_split(&xas, old, order);
					if (xas_error(&xas))
						goto unlock;
					order = split_order;
					split_order =
						max(xas_try_split_min_order(
							    split_order),
						    forder);
				}
				xas_reset(&xas);
			}
			if (shadowp)
				*shadowp = old;
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				__lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}

unlock:
		xas_unlock_irq(&xas);

		if (!xas_nomem(&xas, gfp))
			break;
	}

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	folio->mapping = NULL;
	/* Leave folio->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int ret;
	struct mem_cgroup *tmp;
	bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);

	if (kernel_file)
		tmp = set_active_memcg(root_mem_cgroup);
	ret = mem_cgroup_charge(folio, NULL, gfp);
	if (kernel_file)
		set_active_memcg(tmp);
	if (ret)
		return ret;

	__folio_set_locked(folio);
	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(ret)) {
		mem_cgroup_uncharge(folio);
		__folio_clear_locked(folio);
	} else {
		/*
		 * The folio might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed folio.
		 * The exception is folios getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(folio_test_active(folio));
		if (!(gfp & __GFP_WRITE) && shadow)
			workingset_refault(folio, shadow);
		folio_add_lru(folio);
		if (kernel_file)
			mod_node_page_state(folio_pgdat(folio),
					    NR_KERNEL_FILE_PAGES,
					    folio_nr_pages(folio));
	}
	return ret;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
	int n;
	struct folio *folio;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			folio = __folio_alloc_node_noprof(gfp, order, n);
		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));

		return folio;
	}
	return folio_alloc_noprof(gfp, order);
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);
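
/*
 * Typical pairing (sketch): code that operates on two inodes at once,
 * e.g. remap/clone style operations, takes both invalidate_locks up front
 * and drops them together when done:
 *
 *	filemap_invalidate_lock_two(inode1->i_mapping, inode2->i_mapping);
 *	... invalidate or remap the page cache of both inodes ...
 *	filemap_invalidate_unlock_two(inode1->i_mapping, inode2->i_mapping);
 *
 * Both helpers accept NULL and identical mappings, so callers need not
 * special-case those.
 */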

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}

/* How many times do we accept lock stealing from under a waiter? */
static int sysctl_page_lock_unfairness = 5;
static const struct ctl_table filemap_sysctl_table[] = {
	{
		.procname	= "page_lock_unfairness",
		.data		= &sysctl_page_lock_unfairness,
		.maxlen		= sizeof(sysctl_page_lock_unfairness),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	}
};

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
	register_sysctl_init("vm", filemap_sysctl_table);
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior.  If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->folio->flags.f))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->folio->flags.f))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key(q, TASK_NORMAL, &key);

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags.f))
			return false;
	} else if (test_bit(bit_nr, &folio->flags.f))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = pfn_swap_entry_folio(entry);

	q = folio_waitqueue(folio);
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);
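
/*
 * These are the building blocks for the higher-level waiters declared in
 * <linux/pagemap.h>.  For instance, folio_wait_locked() is conceptually:
 *
 *	if (folio_test_locked(folio))
 *		folio_wait_bit(folio, PG_locked);
 *
 * and folio_wait_writeback() waits on PG_writeback in the same way (the
 * SHARED case in enum behavior above).
 */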

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);
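
/*
 * The canonical locking pattern (sketch): most callers use folio_trylock()
 * or folio_lock() from <linux/pagemap.h>; folio_lock() falls back to
 * __folio_lock() further down in this file when it has to sleep:
 *
 *	folio_lock(folio);
 *	... folio cannot be removed from the page cache or migrated ...
 *	folio_unlock(folio);
 */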

/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * When all reads against a folio have completed, filesystems should
 * call this function to let the pagecache know that no more reads
 * are outstanding.  This will unlock the folio and wake up any thread
 * sleeping on the lock.  The folio will also be marked uptodate if all
 * reads succeeded.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long mask = 1 << PG_locked;

	/* Must be in bottom byte for x86 to work */
	BUILD_BUG_ON(PG_uptodate > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);

	if (likely(success))
		mask |= 1 << PG_uptodate;
	if (folio_xor_flags_has_waiters(folio, mask))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
	folio_wake_bit(folio, PG_private_2);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio.
 */
void folio_wait_private_2(struct folio *folio)
{
	while (folio_test_private_2(folio))
		folio_wait_bit(folio, PG_private_2);
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
 * received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
	int ret = 0;

	while (folio_test_private_2(folio)) {
		ret = folio_wait_bit_killable(folio, PG_private_2);
		if (ret < 0)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

static void filemap_end_dropbehind(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return;
	if (!folio_test_clear_dropbehind(folio))
		return;
	if (mapping)
		folio_unmap_invalidate(mapping, folio, 0);
}

/*
 * If folio was marked as dropbehind, then pages should be dropped when writeback
 * completes. Do that now. If we fail, it's likely because of a big folio -
 * just reset dropbehind for that case and later completions should invalidate.
 */
void folio_end_dropbehind(struct folio *folio)
{
	if (!folio_test_dropbehind(folio))
		return;

	/*
	 * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
	 * but can happen if normal writeback just happens to find dirty folios
	 * that were created as part of uncached writeback, and that writeback
	 * would otherwise not need non-IRQ handling.  Just skip the
	 * invalidation in that case.
	 */
	if (in_task() && folio_trylock(folio)) {
		filemap_end_dropbehind(folio);
		folio_unlock(folio);
	}
}
EXPORT_SYMBOL_GPL(folio_end_dropbehind);

/**
 * folio_end_writeback_no_dropbehind - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 * This call is intended for filesystems that need to defer dropbehind.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback_no_dropbehind(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * folio_test_clear_reclaim() could be used here but it is an
	 * atomic operation and overkill in this particular case. Failing
	 * to shuffle a folio marked for immediate reclaim is too mild
	 * a gain to justify taking an atomic operation penalty at the
	 * end of every folio writeback.
	 */
	if (folio_test_reclaim(folio)) {
		folio_clear_reclaim(folio);
		folio_rotate_reclaimable(folio);
	}

	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);

	acct_reclaim_writeback(folio);
}
EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind);

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * Writeback does not hold a folio reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the folio is not freed and
	 * reused before the folio_wake_bit().
	 */
	folio_get(folio);
	folio_end_writeback_no_dropbehind(folio);
	folio_end_dropbehind(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
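
/*
 * Sketch of the usual caller: a filesystem's write completion handler
 * (often running from interrupt context) reports any failure against the
 * mapping and then ends writeback on each folio of the I/O:
 *
 *	if (write_failed)
 *		mapping_set_error(folio->mapping, -EIO);
 *	folio_end_writeback(folio);
 *
 * The error plumbing differs per filesystem; the invariant part is that
 * folio_end_writeback() is the last touch of the folio, since the folio
 * may be reclaimed or truncated as soon as PG_writeback is cleared.
 */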

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

int __folio_lock_killable(struct folio *folio)
{
	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = folio_waitqueue(folio);
	int ret;

	wait->folio = folio;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	folio_set_waiters(folio);
	ret = !folio_trylock(folio);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success, we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	unsigned int flags = vmf->flags;

	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
		 * released even though returning VM_FAULT_RETRY.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;

		release_fault_lock(vmf);
		if (flags & FAULT_FLAG_KILLABLE)
			folio_wait_locked_killable(folio);
		else
			folio_wait_locked(folio);
		return VM_FAULT_RETRY;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		bool ret;

		ret = __folio_lock_killable(folio);
		if (ret) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
	} else {
		__folio_lock(folio);
	}

	return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);
	unsigned long nr = max_scan;

	while (nr--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			return xas.xa_index;
		if (xas.xa_index == 0)
			return 0;
	}

	return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);
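
/*
 * Usage sketch: readahead-style code probes how much of a window is
 * already cached before deciding what to read:
 *
 *	pgoff_t gap = page_cache_next_miss(mapping, index, max_scan);
 *	if (gap - index >= max_scan)
 *		return;
 *
 * where taking the early return means no gap was found in the range,
 * subject to the snapshot caveat documented above.
 */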

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or GUP-fast) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
 * it is returned with an increased refcount.  If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
Return it without attempting to raise page count.1907*/1908if (!folio || xa_is_value(folio))1909goto out;19101911if (!folio_try_get(folio))1912goto repeat;19131914if (unlikely(folio != xas_reload(&xas))) {1915folio_put(folio);1916goto repeat;1917}1918out:1919rcu_read_unlock();19201921return folio;1922}19231924/**1925* __filemap_get_folio - Find and get a reference to a folio.1926* @mapping: The address_space to search.1927* @index: The page index.1928* @fgp_flags: %FGP flags modify how the folio is returned.1929* @gfp: Memory allocation flags to use if %FGP_CREAT is specified.1930*1931* Looks up the page cache entry at @mapping & @index.1932*1933* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even1934* if the %GFP flags specified for %FGP_CREAT are atomic.1935*1936* If this function returns a folio, it is returned with an increased refcount.1937*1938* Return: The found folio or an ERR_PTR() otherwise.1939*/1940struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,1941fgf_t fgp_flags, gfp_t gfp)1942{1943struct folio *folio;19441945repeat:1946folio = filemap_get_entry(mapping, index);1947if (xa_is_value(folio))1948folio = NULL;1949if (!folio)1950goto no_page;19511952if (fgp_flags & FGP_LOCK) {1953if (fgp_flags & FGP_NOWAIT) {1954if (!folio_trylock(folio)) {1955folio_put(folio);1956return ERR_PTR(-EAGAIN);1957}1958} else {1959folio_lock(folio);1960}19611962/* Has the page been truncated? */1963if (unlikely(folio->mapping != mapping)) {1964folio_unlock(folio);1965folio_put(folio);1966goto repeat;1967}1968VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);1969}19701971if (fgp_flags & FGP_ACCESSED)1972folio_mark_accessed(folio);1973else if (fgp_flags & FGP_WRITE) {1974/* Clear idle flag for buffer write */1975if (folio_test_idle(folio))1976folio_clear_idle(folio);1977}19781979if (fgp_flags & FGP_STABLE)1980folio_wait_stable(folio);1981no_page:1982if (!folio && (fgp_flags & FGP_CREAT)) {1983unsigned int min_order = mapping_min_folio_order(mapping);1984unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));1985int err;1986index = mapping_align_index(mapping, index);19871988if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))1989gfp |= __GFP_WRITE;1990if (fgp_flags & FGP_NOFS)1991gfp &= ~__GFP_FS;1992if (fgp_flags & FGP_NOWAIT) {1993gfp &= ~GFP_KERNEL;1994gfp |= GFP_NOWAIT;1995}1996if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))1997fgp_flags |= FGP_LOCK;19981999if (order > mapping_max_folio_order(mapping))2000order = mapping_max_folio_order(mapping);2001/* If we're not aligned, allocate a smaller folio */2002if (index & ((1UL << order) - 1))2003order = __ffs(index);20042005do {2006gfp_t alloc_gfp = gfp;20072008err = -ENOMEM;2009if (order > min_order)2010alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;2011folio = filemap_alloc_folio(alloc_gfp, order);2012if (!folio)2013continue;20142015/* Init accessed so avoid atomic mark_page_accessed later */2016if (fgp_flags & FGP_ACCESSED)2017__folio_set_referenced(folio);2018if (fgp_flags & FGP_DONTCACHE)2019__folio_set_dropbehind(folio);20202021err = filemap_add_folio(mapping, folio, index, gfp);2022if (!err)2023break;2024folio_put(folio);2025folio = NULL;2026} while (order-- > min_order);20272028if (err == -EEXIST)2029goto repeat;2030if (err) {2031/*2032* When NOWAIT I/O fails to allocate folios this could2033* be due to a nonblocking memory allocation and not2034* because the system actually is out of memory.2035* Return -EAGAIN so that there caller retries in a2036* blocking fashion 
instead of propagating -ENOMEM2037* to the application.2038*/2039if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)2040err = -EAGAIN;2041return ERR_PTR(err);2042}2043/*2044* filemap_add_folio locks the page, and for mmap2045* we expect an unlocked page.2046*/2047if (folio && (fgp_flags & FGP_FOR_MMAP))2048folio_unlock(folio);2049}20502051if (!folio)2052return ERR_PTR(-ENOENT);2053/* not an uncached lookup, clear uncached if set */2054if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))2055folio_clear_dropbehind(folio);2056return folio;2057}2058EXPORT_SYMBOL(__filemap_get_folio);20592060static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,2061xa_mark_t mark)2062{2063struct folio *folio;20642065retry:2066if (mark == XA_PRESENT)2067folio = xas_find(xas, max);2068else2069folio = xas_find_marked(xas, max, mark);20702071if (xas_retry(xas, folio))2072goto retry;2073/*2074* A shadow entry of a recently evicted page, a swap2075* entry from shmem/tmpfs or a DAX entry. Return it2076* without attempting to raise page count.2077*/2078if (!folio || xa_is_value(folio))2079return folio;20802081if (!folio_try_get(folio))2082goto reset;20832084if (unlikely(folio != xas_reload(xas))) {2085folio_put(folio);2086goto reset;2087}20882089return folio;2090reset:2091xas_reset(xas);2092goto retry;2093}20942095/**2096* find_get_entries - gang pagecache lookup2097* @mapping: The address_space to search2098* @start: The starting page cache index2099* @end: The final page index (inclusive).2100* @fbatch: Where the resulting entries are placed.2101* @indices: The cache indices corresponding to the entries in @entries2102*2103* find_get_entries() will search for and return a batch of entries in2104* the mapping. The entries are placed in @fbatch. find_get_entries()2105* takes a reference on any actual folios it returns.2106*2107* The entries have ascending indexes. The indices may not be consecutive2108* due to not-present entries or large folios.2109*2110* Any shadow entries of evicted folios, or swap entries from2111* shmem/tmpfs, are included in the returned array.2112*2113* Return: The number of entries which were found.2114*/2115unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,2116pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)2117{2118XA_STATE(xas, &mapping->i_pages, *start);2119struct folio *folio;21202121rcu_read_lock();2122while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {2123indices[fbatch->nr] = xas.xa_index;2124if (!folio_batch_add(fbatch, folio))2125break;2126}21272128if (folio_batch_count(fbatch)) {2129unsigned long nr;2130int idx = folio_batch_count(fbatch) - 1;21312132folio = fbatch->folios[idx];2133if (!xa_is_value(folio))2134nr = folio_nr_pages(folio);2135else2136nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);2137*start = round_down(indices[idx] + nr, nr);2138}2139rcu_read_unlock();21402141return folio_batch_count(fbatch);2142}21432144/**2145* find_lock_entries - Find a batch of pagecache entries.2146* @mapping: The address_space to search.2147* @start: The starting page cache index.2148* @end: The final page index (inclusive).2149* @fbatch: Where the resulting entries are placed.2150* @indices: The cache indices of the entries in @fbatch.2151*2152* find_lock_entries() will return a batch of entries from @mapping.2153* Swap, shadow and DAX entries are included. Folios are returned2154* locked and with an incremented refcount. Folios which are locked2155* by somebody else or under writeback are skipped. 
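 *
 * Example (illustrative sketch): a ->write_begin() style lookup with
 * __filemap_get_folio(), asking for a locked folio that is created if
 * absent and stable against writeback. The helper name is hypothetical;
 * on failure an ERR_PTR() is returned, never NULL.
 *
 *	static struct folio *example_grab_for_write(struct address_space *mapping,
 *						    loff_t pos)
 *	{
 *		fgf_t fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
 *
 *		return __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
 *					   mapping_gfp_mask(mapping));
 *	}
 *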
Folios which are2156* partially outside the range are not returned.2157*2158* The entries have ascending indexes. The indices may not be consecutive2159* due to not-present entries, large folios, folios which could not be2160* locked or folios under writeback.2161*2162* Return: The number of entries which were found.2163*/2164unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,2165pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)2166{2167XA_STATE(xas, &mapping->i_pages, *start);2168struct folio *folio;21692170rcu_read_lock();2171while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {2172unsigned long base;2173unsigned long nr;21742175if (!xa_is_value(folio)) {2176nr = folio_nr_pages(folio);2177base = folio->index;2178/* Omit large folio which begins before the start */2179if (base < *start)2180goto put;2181/* Omit large folio which extends beyond the end */2182if (base + nr - 1 > end)2183goto put;2184if (!folio_trylock(folio))2185goto put;2186if (folio->mapping != mapping ||2187folio_test_writeback(folio))2188goto unlock;2189VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),2190folio);2191} else {2192nr = 1 << xas_get_order(&xas);2193base = xas.xa_index & ~(nr - 1);2194/* Omit order>0 value which begins before the start */2195if (base < *start)2196continue;2197/* Omit order>0 value which extends beyond the end */2198if (base + nr - 1 > end)2199break;2200}22012202/* Update start now so that last update is correct on return */2203*start = base + nr;2204indices[fbatch->nr] = xas.xa_index;2205if (!folio_batch_add(fbatch, folio))2206break;2207continue;2208unlock:2209folio_unlock(folio);2210put:2211folio_put(folio);2212}2213rcu_read_unlock();22142215return folio_batch_count(fbatch);2216}22172218/**2219* filemap_get_folios - Get a batch of folios2220* @mapping: The address_space to search2221* @start: The starting page index2222* @end: The final page index (inclusive)2223* @fbatch: The batch to fill.2224*2225* Search for and return a batch of folios in the mapping starting at2226* index @start and up to index @end (inclusive). The folios are returned2227* in @fbatch with an elevated reference count.2228*2229* Return: The number of folios which were found.2230* We also update @start to index the next folio for the traversal.2231*/2232unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,2233pgoff_t end, struct folio_batch *fbatch)2234{2235return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);2236}2237EXPORT_SYMBOL(filemap_get_folios);22382239/**2240* filemap_get_folios_contig - Get a batch of contiguous folios2241* @mapping: The address_space to search2242* @start: The starting page index2243* @end: The final page index (inclusive)2244* @fbatch: The batch to fill2245*2246* filemap_get_folios_contig() works exactly like filemap_get_folios(),2247* except the returned folios are guaranteed to be contiguous. 
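 *
 * Example (illustrative sketch): the usual batched iteration pattern for
 * filemap_get_folios(), here simply marking every folio in a range as
 * accessed. The helper name is hypothetical. filemap_get_folios() advances
 * @start itself, and folio_batch_release() drops the references it took.
 *
 *	static void example_mark_range_accessed(struct address_space *mapping,
 *						pgoff_t start, pgoff_t end)
 *	{
 *		struct folio_batch fbatch;
 *		unsigned int i;
 *
 *		folio_batch_init(&fbatch);
 *		while (filemap_get_folios(mapping, &start, end, &fbatch)) {
 *			for (i = 0; i < folio_batch_count(&fbatch); i++)
 *				folio_mark_accessed(fbatch.folios[i]);
 *			folio_batch_release(&fbatch);
 *			cond_resched();
 *		}
 *	}
 *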
This may2248* not return all contiguous folios if the batch gets filled up.2249*2250* Return: The number of folios found.2251* Also update @start to be positioned for traversal of the next folio.2252*/22532254unsigned filemap_get_folios_contig(struct address_space *mapping,2255pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)2256{2257XA_STATE(xas, &mapping->i_pages, *start);2258unsigned long nr;2259struct folio *folio;22602261rcu_read_lock();22622263for (folio = xas_load(&xas); folio && xas.xa_index <= end;2264folio = xas_next(&xas)) {2265if (xas_retry(&xas, folio))2266continue;2267/*2268* If the entry has been swapped out, we can stop looking.2269* No current caller is looking for DAX entries.2270*/2271if (xa_is_value(folio))2272goto update_start;22732274/* If we landed in the middle of a THP, continue at its end. */2275if (xa_is_sibling(folio))2276goto update_start;22772278if (!folio_try_get(folio))2279goto retry;22802281if (unlikely(folio != xas_reload(&xas)))2282goto put_folio;22832284if (!folio_batch_add(fbatch, folio)) {2285nr = folio_nr_pages(folio);2286*start = folio->index + nr;2287goto out;2288}2289xas_advance(&xas, folio_next_index(folio) - 1);2290continue;2291put_folio:2292folio_put(folio);22932294retry:2295xas_reset(&xas);2296}22972298update_start:2299nr = folio_batch_count(fbatch);23002301if (nr) {2302folio = fbatch->folios[nr - 1];2303*start = folio_next_index(folio);2304}2305out:2306rcu_read_unlock();2307return folio_batch_count(fbatch);2308}2309EXPORT_SYMBOL(filemap_get_folios_contig);23102311/**2312* filemap_get_folios_tag - Get a batch of folios matching @tag2313* @mapping: The address_space to search2314* @start: The starting page index2315* @end: The final page index (inclusive)2316* @tag: The tag index2317* @fbatch: The batch to fill2318*2319* The first folio may start before @start; if it does, it will contain2320* @start. The final folio may extend beyond @end; if it does, it will2321* contain @end. The folios have ascending indices. There may be gaps2322* between the folios if there are indices which have no folio in the2323* page cache. If folios are added to or removed from the page cache2324* while this is running, they may or may not be found by this call.2325* Only returns folios that are tagged with @tag.2326*2327* Return: The number of folios found.2328* Also update @start to index the next folio for traversal.2329*/2330unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,2331pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)2332{2333XA_STATE(xas, &mapping->i_pages, *start);2334struct folio *folio;23352336rcu_read_lock();2337while ((folio = find_get_entry(&xas, end, tag)) != NULL) {2338/*2339* Shadow entries should never be tagged, but this iteration2340* is lockless so there is a window for page reclaim to evict2341* a page we saw tagged. Skip over it.2342*/2343if (xa_is_value(folio))2344continue;2345if (!folio_batch_add(fbatch, folio)) {2346unsigned long nr = folio_nr_pages(folio);2347*start = folio->index + nr;2348goto out;2349}2350}2351/*2352* We come here when there is no page beyond @end. We take care to not2353* overflow the index @start as it confuses some of the callers. This2354* breaks the iteration when there is a page at index -1 but that is2355* already broke anyway.2356*/2357if (end == (pgoff_t)-1)2358*start = (pgoff_t)-1;2359else2360*start = end + 1;2361out:2362rcu_read_unlock();23632364return folio_batch_count(fbatch);2365}2366EXPORT_SYMBOL(filemap_get_folios_tag);23672368/*2369* CD/DVDs are error prone. 
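 *
 * Example (illustrative sketch): the same batched pattern applied to
 * filemap_get_folios_tag() above, counting dirty folios in a range much
 * like a writeback walk would visit them. The helper name is hypothetical.
 *
 *	static unsigned long example_count_dirty(struct address_space *mapping,
 *						 pgoff_t start, pgoff_t end)
 *	{
 *		struct folio_batch fbatch;
 *		unsigned long nr = 0;
 *
 *		folio_batch_init(&fbatch);
 *		while (filemap_get_folios_tag(mapping, &start, end,
 *					      PAGECACHE_TAG_DIRTY, &fbatch)) {
 *			nr += folio_batch_count(&fbatch);
 *			folio_batch_release(&fbatch);
 *			cond_resched();
 *		}
 *		return nr;
 *	}
 *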
When a medium error occurs, the driver may fail2370* a _large_ part of the i/o request. Imagine the worst scenario:2371*2372* ---R__________________________________________B__________2373* ^ reading here ^ bad block(assume 4k)2374*2375* read(R) => miss => readahead(R...B) => media error => frustrating retries2376* => failing the whole request => read(R) => read(R+1) =>2377* readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>2378* readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>2379* readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......2380*2381* It is going insane. Fix it by quickly scaling down the readahead size.2382*/2383static void shrink_readahead_size_eio(struct file_ra_state *ra)2384{2385ra->ra_pages /= 4;2386}23872388/*2389* filemap_get_read_batch - Get a batch of folios for read2390*2391* Get a batch of folios which represent a contiguous range of bytes in2392* the file. No exceptional entries will be returned. If @index is in2393* the middle of a folio, the entire folio will be returned. The last2394* folio in the batch may have the readahead flag set or the uptodate flag2395* clear so that the caller can take the appropriate action.2396*/2397static void filemap_get_read_batch(struct address_space *mapping,2398pgoff_t index, pgoff_t max, struct folio_batch *fbatch)2399{2400XA_STATE(xas, &mapping->i_pages, index);2401struct folio *folio;24022403rcu_read_lock();2404for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {2405if (xas_retry(&xas, folio))2406continue;2407if (xas.xa_index > max || xa_is_value(folio))2408break;2409if (xa_is_sibling(folio))2410break;2411if (!folio_try_get(folio))2412goto retry;24132414if (unlikely(folio != xas_reload(&xas)))2415goto put_folio;24162417if (!folio_batch_add(fbatch, folio))2418break;2419if (!folio_test_uptodate(folio))2420break;2421if (folio_test_readahead(folio))2422break;2423xas_advance(&xas, folio_next_index(folio) - 1);2424continue;2425put_folio:2426folio_put(folio);2427retry:2428xas_reset(&xas);2429}2430rcu_read_unlock();2431}24322433static int filemap_read_folio(struct file *file, filler_t filler,2434struct folio *folio)2435{2436bool workingset = folio_test_workingset(folio);2437unsigned long pflags;2438int error;24392440/* Start the actual read. The read will unlock the page. 
*/2441if (unlikely(workingset))2442psi_memstall_enter(&pflags);2443error = filler(file, folio);2444if (unlikely(workingset))2445psi_memstall_leave(&pflags);2446if (error)2447return error;24482449error = folio_wait_locked_killable(folio);2450if (error)2451return error;2452if (folio_test_uptodate(folio))2453return 0;2454if (file)2455shrink_readahead_size_eio(&file->f_ra);2456return -EIO;2457}24582459static bool filemap_range_uptodate(struct address_space *mapping,2460loff_t pos, size_t count, struct folio *folio,2461bool need_uptodate)2462{2463if (folio_test_uptodate(folio))2464return true;2465/* pipes can't handle partially uptodate pages */2466if (need_uptodate)2467return false;2468if (!mapping->a_ops->is_partially_uptodate)2469return false;2470if (mapping->host->i_blkbits >= folio_shift(folio))2471return false;24722473if (folio_pos(folio) > pos) {2474count -= folio_pos(folio) - pos;2475pos = 0;2476} else {2477pos -= folio_pos(folio);2478}24792480if (pos == 0 && count >= folio_size(folio))2481return false;24822483return mapping->a_ops->is_partially_uptodate(folio, pos, count);2484}24852486static int filemap_update_page(struct kiocb *iocb,2487struct address_space *mapping, size_t count,2488struct folio *folio, bool need_uptodate)2489{2490int error;24912492if (iocb->ki_flags & IOCB_NOWAIT) {2493if (!filemap_invalidate_trylock_shared(mapping))2494return -EAGAIN;2495} else {2496filemap_invalidate_lock_shared(mapping);2497}24982499if (!folio_trylock(folio)) {2500error = -EAGAIN;2501if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))2502goto unlock_mapping;2503if (!(iocb->ki_flags & IOCB_WAITQ)) {2504filemap_invalidate_unlock_shared(mapping);2505/*2506* This is where we usually end up waiting for a2507* previously submitted readahead to finish.2508*/2509folio_put_wait_locked(folio, TASK_KILLABLE);2510return AOP_TRUNCATED_PAGE;2511}2512error = __folio_lock_async(folio, iocb->ki_waitq);2513if (error)2514goto unlock_mapping;2515}25162517error = AOP_TRUNCATED_PAGE;2518if (!folio->mapping)2519goto unlock;25202521error = 0;2522if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,2523need_uptodate))2524goto unlock;25252526error = -EAGAIN;2527if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))2528goto unlock;25292530error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,2531folio);2532goto unlock_mapping;2533unlock:2534folio_unlock(folio);2535unlock_mapping:2536filemap_invalidate_unlock_shared(mapping);2537if (error == AOP_TRUNCATED_PAGE)2538folio_put(folio);2539return error;2540}25412542static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)2543{2544struct address_space *mapping = iocb->ki_filp->f_mapping;2545struct folio *folio;2546int error;2547unsigned int min_order = mapping_min_folio_order(mapping);2548pgoff_t index;25492550if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))2551return -EAGAIN;25522553folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);2554if (!folio)2555return -ENOMEM;2556if (iocb->ki_flags & IOCB_DONTCACHE)2557__folio_set_dropbehind(folio);25582559/*2560* Protect against truncate / hole punch. Grabbing invalidate_lock2561* here assures we cannot instantiate and bring uptodate new2562* pagecache folios after evicting page cache during truncate2563* and before actually freeing blocks. Note that we could2564* release invalidate_lock after inserting the folio into2565* the page cache as the locked folio would then be enough to2566* synchronize with hole punching. 
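 *
 * Example (illustrative sketch): the exclusive counterpart to the shared
 * invalidate_lock usage described above. A truncating path takes the lock
 * exclusively so it cannot race with readers instantiating new pagecache
 * folios for the range being removed. The function name is hypothetical;
 * a real filesystem would also free its on-disk blocks while still
 * holding the lock.
 *
 *	static void examplefs_truncate(struct inode *inode, loff_t newsize)
 *	{
 *		struct address_space *mapping = inode->i_mapping;
 *
 *		filemap_invalidate_lock(mapping);
 *		truncate_setsize(inode, newsize);
 *		filemap_invalidate_unlock(mapping);
 *	}
 *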
But there are code paths2567* such as filemap_update_page() filling in partially uptodate2568* pages or ->readahead() that need to hold invalidate_lock2569* while mapping blocks for IO so let's hold the lock here as2570* well to keep locking rules simple.2571*/2572filemap_invalidate_lock_shared(mapping);2573index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;2574error = filemap_add_folio(mapping, folio, index,2575mapping_gfp_constraint(mapping, GFP_KERNEL));2576if (error == -EEXIST)2577error = AOP_TRUNCATED_PAGE;2578if (error)2579goto error;25802581error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,2582folio);2583if (error)2584goto error;25852586filemap_invalidate_unlock_shared(mapping);2587folio_batch_add(fbatch, folio);2588return 0;2589error:2590filemap_invalidate_unlock_shared(mapping);2591folio_put(folio);2592return error;2593}25942595static int filemap_readahead(struct kiocb *iocb, struct file *file,2596struct address_space *mapping, struct folio *folio,2597pgoff_t last_index)2598{2599DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);26002601if (iocb->ki_flags & IOCB_NOIO)2602return -EAGAIN;2603if (iocb->ki_flags & IOCB_DONTCACHE)2604ractl.dropbehind = 1;2605page_cache_async_ra(&ractl, folio, last_index - folio->index);2606return 0;2607}26082609static int filemap_get_pages(struct kiocb *iocb, size_t count,2610struct folio_batch *fbatch, bool need_uptodate)2611{2612struct file *filp = iocb->ki_filp;2613struct address_space *mapping = filp->f_mapping;2614pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;2615pgoff_t last_index;2616struct folio *folio;2617unsigned int flags;2618int err = 0;26192620/* "last_index" is the index of the folio beyond the end of the read */2621last_index = round_up(iocb->ki_pos + count,2622mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;2623retry:2624if (fatal_signal_pending(current))2625return -EINTR;26262627filemap_get_read_batch(mapping, index, last_index - 1, fbatch);2628if (!folio_batch_count(fbatch)) {2629DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);26302631if (iocb->ki_flags & IOCB_NOIO)2632return -EAGAIN;2633if (iocb->ki_flags & IOCB_NOWAIT)2634flags = memalloc_noio_save();2635if (iocb->ki_flags & IOCB_DONTCACHE)2636ractl.dropbehind = 1;2637page_cache_sync_ra(&ractl, last_index - index);2638if (iocb->ki_flags & IOCB_NOWAIT)2639memalloc_noio_restore(flags);2640filemap_get_read_batch(mapping, index, last_index - 1, fbatch);2641}2642if (!folio_batch_count(fbatch)) {2643err = filemap_create_folio(iocb, fbatch);2644if (err == AOP_TRUNCATED_PAGE)2645goto retry;2646return err;2647}26482649folio = fbatch->folios[folio_batch_count(fbatch) - 1];2650if (folio_test_readahead(folio)) {2651err = filemap_readahead(iocb, filp, mapping, folio, last_index);2652if (err)2653goto err;2654}2655if (!folio_test_uptodate(folio)) {2656if (folio_batch_count(fbatch) > 1) {2657err = -EAGAIN;2658goto err;2659}2660err = filemap_update_page(iocb, mapping, count, folio,2661need_uptodate);2662if (err)2663goto err;2664}26652666trace_mm_filemap_get_pages(mapping, index, last_index - 1);2667return 0;2668err:2669if (err < 0)2670folio_put(folio);2671if (likely(--fbatch->nr))2672return 0;2673if (err == AOP_TRUNCATED_PAGE)2674goto retry;2675return err;2676}26772678static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)2679{2680unsigned int shift = folio_shift(folio);26812682return (pos1 >> shift == pos2 >> shift);2683}26842685static void filemap_end_dropbehind_read(struct folio *folio)2686{2687if 
(!folio_test_dropbehind(folio))2688return;2689if (folio_test_writeback(folio) || folio_test_dirty(folio))2690return;2691if (folio_trylock(folio)) {2692filemap_end_dropbehind(folio);2693folio_unlock(folio);2694}2695}26962697/**2698* filemap_read - Read data from the page cache.2699* @iocb: The iocb to read.2700* @iter: Destination for the data.2701* @already_read: Number of bytes already read by the caller.2702*2703* Copies data from the page cache. If the data is not currently present,2704* uses the readahead and read_folio address_space operations to fetch it.2705*2706* Return: Total number of bytes copied, including those already read by2707* the caller. If an error happens before any bytes are copied, returns2708* a negative error number.2709*/2710ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,2711ssize_t already_read)2712{2713struct file *filp = iocb->ki_filp;2714struct file_ra_state *ra = &filp->f_ra;2715struct address_space *mapping = filp->f_mapping;2716struct inode *inode = mapping->host;2717struct folio_batch fbatch;2718int i, error = 0;2719bool writably_mapped;2720loff_t isize, end_offset;2721loff_t last_pos = ra->prev_pos;27222723if (unlikely(iocb->ki_pos < 0))2724return -EINVAL;2725if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))2726return 0;2727if (unlikely(!iov_iter_count(iter)))2728return 0;27292730iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);2731folio_batch_init(&fbatch);27322733do {2734cond_resched();27352736/*2737* If we've already successfully copied some data, then we2738* can no longer safely return -EIOCBQUEUED. Hence mark2739* an async read NOWAIT at that point.2740*/2741if ((iocb->ki_flags & IOCB_WAITQ) && already_read)2742iocb->ki_flags |= IOCB_NOWAIT;27432744if (unlikely(iocb->ki_pos >= i_size_read(inode)))2745break;27462747error = filemap_get_pages(iocb, iter->count, &fbatch, false);2748if (error < 0)2749break;27502751/*2752* i_size must be checked after we know the pages are Uptodate.2753*2754* Checking i_size after the check allows us to calculate2755* the correct value for "nr", which means the zero-filled2756* part of the page is not copied back to userspace (unless2757* another truncate extends the file - this is desired though).2758*/2759isize = i_size_read(inode);2760if (unlikely(iocb->ki_pos >= isize))2761goto put_folios;2762end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);27632764/*2765* Once we start copying data, we don't want to be touching any2766* cachelines that might be contended:2767*/2768writably_mapped = mapping_writably_mapped(mapping);27692770/*2771* When a read accesses the same folio several times, only2772* mark it as accessed the first time.2773*/2774if (!pos_same_folio(iocb->ki_pos, last_pos - 1,2775fbatch.folios[0]))2776folio_mark_accessed(fbatch.folios[0]);27772778for (i = 0; i < folio_batch_count(&fbatch); i++) {2779struct folio *folio = fbatch.folios[i];2780size_t fsize = folio_size(folio);2781size_t offset = iocb->ki_pos & (fsize - 1);2782size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,2783fsize - offset);2784size_t copied;27852786if (end_offset < folio_pos(folio))2787break;2788if (i > 0)2789folio_mark_accessed(folio);2790/*2791* If users can be writing to this folio using arbitrary2792* virtual addresses, take care of potential aliasing2793* before reading the folio on the kernel side.2794*/2795if (writably_mapped)2796flush_dcache_folio(folio);27972798copied = copy_folio_to_iter(folio, offset, bytes, iter);27992800already_read += copied;2801iocb->ki_pos += 
copied;2802last_pos = iocb->ki_pos;28032804if (copied < bytes) {2805error = -EFAULT;2806break;2807}2808}2809put_folios:2810for (i = 0; i < folio_batch_count(&fbatch); i++) {2811struct folio *folio = fbatch.folios[i];28122813filemap_end_dropbehind_read(folio);2814folio_put(folio);2815}2816folio_batch_init(&fbatch);2817} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);28182819file_accessed(filp);2820ra->prev_pos = last_pos;2821return already_read ? already_read : error;2822}2823EXPORT_SYMBOL_GPL(filemap_read);28242825int kiocb_write_and_wait(struct kiocb *iocb, size_t count)2826{2827struct address_space *mapping = iocb->ki_filp->f_mapping;2828loff_t pos = iocb->ki_pos;2829loff_t end = pos + count - 1;28302831if (iocb->ki_flags & IOCB_NOWAIT) {2832if (filemap_range_needs_writeback(mapping, pos, end))2833return -EAGAIN;2834return 0;2835}28362837return filemap_write_and_wait_range(mapping, pos, end);2838}2839EXPORT_SYMBOL_GPL(kiocb_write_and_wait);28402841int filemap_invalidate_pages(struct address_space *mapping,2842loff_t pos, loff_t end, bool nowait)2843{2844int ret;28452846if (nowait) {2847/* we could block if there are any pages in the range */2848if (filemap_range_has_page(mapping, pos, end))2849return -EAGAIN;2850} else {2851ret = filemap_write_and_wait_range(mapping, pos, end);2852if (ret)2853return ret;2854}28552856/*2857* After a write we want buffered reads to be sure to go to disk to get2858* the new data. We invalidate clean cached page from the region we're2859* about to write. We do this *before* the write so that we can return2860* without clobbering -EIOCBQUEUED from ->direct_IO().2861*/2862return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,2863end >> PAGE_SHIFT);2864}28652866int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)2867{2868struct address_space *mapping = iocb->ki_filp->f_mapping;28692870return filemap_invalidate_pages(mapping, iocb->ki_pos,2871iocb->ki_pos + count - 1,2872iocb->ki_flags & IOCB_NOWAIT);2873}2874EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);28752876/**2877* generic_file_read_iter - generic filesystem read routine2878* @iocb: kernel I/O control block2879* @iter: destination for the data read2880*2881* This is the "read_iter()" routine for all filesystems2882* that can use the page cache directly.2883*2884* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall2885* be returned when no data can be read without waiting for I/O requests2886* to complete; it doesn't prevent readahead.2887*2888* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O2889* requests shall be made for the read or for readahead. When no data2890* can be read, -EAGAIN shall be returned. 
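 *
 * Example (illustrative sketch): how a direct I/O path might use
 * kiocb_write_and_wait() and kiocb_invalidate_pages() above. A DIO read
 * only needs dirty pagecache flushed so the device read returns current
 * data; a DIO write additionally invalidates the now-stale clean cache
 * (kiocb_invalidate_pages() writes back first itself, or fails with
 * -EAGAIN for IOCB_NOWAIT if pages are present). The helper name is
 * hypothetical.
 *
 *	static int example_dio_prologue(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		size_t count = iov_iter_count(iter);
 *
 *		if (iov_iter_rw(iter) == READ)
 *			return kiocb_write_and_wait(iocb, count);
 *		return kiocb_invalidate_pages(iocb, count);
 *	}
 *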
When readahead would be2891* triggered, a partial, possibly empty read shall be returned.2892*2893* Return:2894* * number of bytes copied, even for partial reads2895* * negative error code (or 0 if IOCB_NOIO) if nothing was read2896*/2897ssize_t2898generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)2899{2900size_t count = iov_iter_count(iter);2901ssize_t retval = 0;29022903if (!count)2904return 0; /* skip atime */29052906if (iocb->ki_flags & IOCB_DIRECT) {2907struct file *file = iocb->ki_filp;2908struct address_space *mapping = file->f_mapping;2909struct inode *inode = mapping->host;29102911retval = kiocb_write_and_wait(iocb, count);2912if (retval < 0)2913return retval;2914file_accessed(file);29152916retval = mapping->a_ops->direct_IO(iocb, iter);2917if (retval >= 0) {2918iocb->ki_pos += retval;2919count -= retval;2920}2921if (retval != -EIOCBQUEUED)2922iov_iter_revert(iter, count - iov_iter_count(iter));29232924/*2925* Btrfs can have a short DIO read if we encounter2926* compressed extents, so if there was an error, or if2927* we've already read everything we wanted to, or if2928* there was a short read because we hit EOF, go ahead2929* and return. Otherwise fallthrough to buffered io for2930* the rest of the read. Buffered reads will not work for2931* DAX files, so don't bother trying.2932*/2933if (retval < 0 || !count || IS_DAX(inode))2934return retval;2935if (iocb->ki_pos >= i_size_read(inode))2936return retval;2937}29382939return filemap_read(iocb, iter, retval);2940}2941EXPORT_SYMBOL(generic_file_read_iter);29422943/*2944* Splice subpages from a folio into a pipe.2945*/2946size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,2947struct folio *folio, loff_t fpos, size_t size)2948{2949struct page *page;2950size_t spliced = 0, offset = offset_in_folio(folio, fpos);29512952page = folio_page(folio, offset / PAGE_SIZE);2953size = min(size, folio_size(folio) - offset);2954offset %= PAGE_SIZE;29552956while (spliced < size && !pipe_is_full(pipe)) {2957struct pipe_buffer *buf = pipe_head_buf(pipe);2958size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);29592960*buf = (struct pipe_buffer) {2961.ops = &page_cache_pipe_buf_ops,2962.page = page,2963.offset = offset,2964.len = part,2965};2966folio_get(folio);2967pipe->head++;2968page++;2969spliced += part;2970offset = 0;2971}29722973return spliced;2974}29752976/**2977* filemap_splice_read - Splice data from a file's pagecache into a pipe2978* @in: The file to read from2979* @ppos: Pointer to the file position to read from2980* @pipe: The pipe to splice into2981* @len: The amount to splice2982* @flags: The SPLICE_F_* flags2983*2984* This function gets folios from a file's pagecache and splices them into the2985* pipe. Readahead will be called as necessary to fill more folios. This may2986* be used for blockdevs also.2987*2988* Return: On success, the number of bytes read will be returned and *@ppos2989* will be updated if appropriate; 0 will be returned if there is no more data2990* to be read; -EAGAIN will be returned if the pipe had no space, and some2991* other negative error code will be returned on error. 
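 *
 * Example (illustrative sketch): how a pagecache-backed filesystem commonly
 * wires up its read side with the generic helpers ("examplefs" is
 * hypothetical; the write and fsync methods are omitted because they
 * depend on the filesystem's own write path).
 *
 *	static const struct file_operations examplefs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_mmap,
 *		.splice_read	= filemap_splice_read,
 *	};
 *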
A short read may occur2992* if the pipe has insufficient space, we reach the end of the data or we hit a2993* hole.2994*/2995ssize_t filemap_splice_read(struct file *in, loff_t *ppos,2996struct pipe_inode_info *pipe,2997size_t len, unsigned int flags)2998{2999struct folio_batch fbatch;3000struct kiocb iocb;3001size_t total_spliced = 0, used, npages;3002loff_t isize, end_offset;3003bool writably_mapped;3004int i, error = 0;30053006if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))3007return 0;30083009init_sync_kiocb(&iocb, in);3010iocb.ki_pos = *ppos;30113012/* Work out how much data we can actually add into the pipe */3013used = pipe_buf_usage(pipe);3014npages = max_t(ssize_t, pipe->max_usage - used, 0);3015len = min_t(size_t, len, npages * PAGE_SIZE);30163017folio_batch_init(&fbatch);30183019do {3020cond_resched();30213022if (*ppos >= i_size_read(in->f_mapping->host))3023break;30243025iocb.ki_pos = *ppos;3026error = filemap_get_pages(&iocb, len, &fbatch, true);3027if (error < 0)3028break;30293030/*3031* i_size must be checked after we know the pages are Uptodate.3032*3033* Checking i_size after the check allows us to calculate3034* the correct value for "nr", which means the zero-filled3035* part of the page is not copied back to userspace (unless3036* another truncate extends the file - this is desired though).3037*/3038isize = i_size_read(in->f_mapping->host);3039if (unlikely(*ppos >= isize))3040break;3041end_offset = min_t(loff_t, isize, *ppos + len);30423043/*3044* Once we start copying data, we don't want to be touching any3045* cachelines that might be contended:3046*/3047writably_mapped = mapping_writably_mapped(in->f_mapping);30483049for (i = 0; i < folio_batch_count(&fbatch); i++) {3050struct folio *folio = fbatch.folios[i];3051size_t n;30523053if (folio_pos(folio) >= end_offset)3054goto out;3055folio_mark_accessed(folio);30563057/*3058* If users can be writing to this folio using arbitrary3059* virtual addresses, take care of potential aliasing3060* before reading the folio on the kernel side.3061*/3062if (writably_mapped)3063flush_dcache_folio(folio);30643065n = min_t(loff_t, len, isize - *ppos);3066n = splice_folio_into_pipe(pipe, folio, *ppos, n);3067if (!n)3068goto out;3069len -= n;3070total_spliced += n;3071*ppos += n;3072in->f_ra.prev_pos = *ppos;3073if (pipe_is_full(pipe))3074goto out;3075}30763077folio_batch_release(&fbatch);3078} while (len);30793080out:3081folio_batch_release(&fbatch);3082file_accessed(in);30833084return total_spliced ? total_spliced : error;3085}3086EXPORT_SYMBOL(filemap_splice_read);30873088static inline loff_t folio_seek_hole_data(struct xa_state *xas,3089struct address_space *mapping, struct folio *folio,3090loff_t start, loff_t end, bool seek_data)3091{3092const struct address_space_operations *ops = mapping->a_ops;3093size_t offset, bsz = i_blocksize(mapping->host);30943095if (xa_is_value(folio) || folio_test_uptodate(folio))3096return seek_data ? start : end;3097if (!ops->is_partially_uptodate)3098return seek_data ? 
end : start;30993100xas_pause(xas);3101rcu_read_unlock();3102folio_lock(folio);3103if (unlikely(folio->mapping != mapping))3104goto unlock;31053106offset = offset_in_folio(folio, start) & ~(bsz - 1);31073108do {3109if (ops->is_partially_uptodate(folio, offset, bsz) ==3110seek_data)3111break;3112start = (start + bsz) & ~((u64)bsz - 1);3113offset += bsz;3114} while (offset < folio_size(folio));3115unlock:3116folio_unlock(folio);3117rcu_read_lock();3118return start;3119}31203121static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)3122{3123if (xa_is_value(folio))3124return PAGE_SIZE << xas_get_order(xas);3125return folio_size(folio);3126}31273128/**3129* mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.3130* @mapping: Address space to search.3131* @start: First byte to consider.3132* @end: Limit of search (exclusive).3133* @whence: Either SEEK_HOLE or SEEK_DATA.3134*3135* If the page cache knows which blocks contain holes and which blocks3136* contain data, your filesystem can use this function to implement3137* SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are3138* entirely memory-based such as tmpfs, and filesystems which support3139* unwritten extents.3140*3141* Return: The requested offset on success, or -ENXIO if @whence specifies3142* SEEK_DATA and there is no data after @start. There is an implicit hole3143* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start3144* and @end contain data.3145*/3146loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,3147loff_t end, int whence)3148{3149XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);3150pgoff_t max = (end - 1) >> PAGE_SHIFT;3151bool seek_data = (whence == SEEK_DATA);3152struct folio *folio;31533154if (end <= start)3155return -ENXIO;31563157rcu_read_lock();3158while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {3159loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;3160size_t seek_size;31613162if (start < pos) {3163if (!seek_data)3164goto unlock;3165start = pos;3166}31673168seek_size = seek_folio_size(&xas, folio);3169pos = round_up((u64)pos + 1, seek_size);3170start = folio_seek_hole_data(&xas, mapping, folio, start, pos,3171seek_data);3172if (start < pos)3173goto unlock;3174if (start >= end)3175break;3176if (seek_size > PAGE_SIZE)3177xas_set(&xas, pos >> PAGE_SHIFT);3178if (!xa_is_value(folio))3179folio_put(folio);3180}3181if (seek_data)3182start = -ENXIO;3183unlock:3184rcu_read_unlock();3185if (folio && !xa_is_value(folio))3186folio_put(folio);3187if (start > end)3188return end;3189return start;3190}31913192#ifdef CONFIG_MMU3193#define MMAP_LOTSAMISS (100)3194/*3195* lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock3196* @vmf - the vm_fault for this fault.3197* @folio - the folio to lock.3198* @fpin - the pointer to the file we may pin (or is already pinned).3199*3200* This works similar to lock_folio_or_retry in that it can drop the3201* mmap_lock. It differs in that it actually returns the folio locked3202* if it returns 1 and 0 if it couldn't lock the folio. If we did have3203* to drop the mmap_lock then fpin will point to the pinned file and3204* needs to be fput()'ed at a later point.3205*/3206static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,3207struct file **fpin)3208{3209if (folio_trylock(folio))3210return 1;32113212/*3213* NOTE! This will make us return with VM_FAULT_RETRY, but with3214* the fault lock still held. 
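 *
 * Example (illustrative sketch): implementing SEEK_HOLE/SEEK_DATA on top of
 * mapping_seek_hole_data() above, roughly the shape tmpfs uses. The function
 * name is hypothetical and error handling is reduced to the essentials.
 *
 *	static loff_t examplefs_llseek(struct file *file, loff_t offset, int whence)
 *	{
 *		struct inode *inode = file_inode(file);
 *
 *		if (whence != SEEK_HOLE && whence != SEEK_DATA)
 *			return generic_file_llseek(file, offset, whence);
 *		if (offset < 0)
 *			return -ENXIO;
 *
 *		inode_lock(inode);
 *		offset = mapping_seek_hole_data(file->f_mapping, offset,
 *						i_size_read(inode), whence);
 *		if (offset >= 0)
 *			offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
 *		inode_unlock(inode);
 *		return offset;
 *	}
 *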
That's how FAULT_FLAG_RETRY_NOWAIT3215* is supposed to work. We have way too many special cases..3216*/3217if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)3218return 0;32193220*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);3221if (vmf->flags & FAULT_FLAG_KILLABLE) {3222if (__folio_lock_killable(folio)) {3223/*3224* We didn't have the right flags to drop the3225* fault lock, but all fault_handlers only check3226* for fatal signals if we return VM_FAULT_RETRY,3227* so we need to drop the fault lock here and3228* return 0 if we don't have a fpin.3229*/3230if (*fpin == NULL)3231release_fault_lock(vmf);3232return 0;3233}3234} else3235__folio_lock(folio);32363237return 1;3238}32393240/*3241* Synchronous readahead happens when we don't even find a page in the page3242* cache at all. We don't want to perform IO under the mmap sem, so if we have3243* to drop the mmap sem we return the file that was pinned in order for us to do3244* that. If we didn't pin a file then we return NULL. The file that is3245* returned needs to be fput()'ed when we're done with it.3246*/3247static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)3248{3249struct file *file = vmf->vma->vm_file;3250struct file_ra_state *ra = &file->f_ra;3251struct address_space *mapping = file->f_mapping;3252DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);3253struct file *fpin = NULL;3254vm_flags_t vm_flags = vmf->vma->vm_flags;3255unsigned short mmap_miss;32563257#ifdef CONFIG_TRANSPARENT_HUGEPAGE3258/* Use the readahead code, even if readahead is disabled */3259if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {3260fpin = maybe_unlock_mmap_for_io(vmf, fpin);3261ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);3262ra->size = HPAGE_PMD_NR;3263/*3264* Fetch two PMD folios, so we get the chance to actually3265* readahead, unless we've been told not to.3266*/3267if (!(vm_flags & VM_RAND_READ))3268ra->size *= 2;3269ra->async_size = HPAGE_PMD_NR;3270ra->order = HPAGE_PMD_ORDER;3271page_cache_ra_order(&ractl, ra);3272return fpin;3273}3274#endif32753276/*3277* If we don't want any read-ahead, don't bother. VM_EXEC case below is3278* already intended for random access.3279*/3280if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)3281return fpin;3282if (!ra->ra_pages)3283return fpin;32843285if (vm_flags & VM_SEQ_READ) {3286fpin = maybe_unlock_mmap_for_io(vmf, fpin);3287page_cache_sync_ra(&ractl, ra->ra_pages);3288return fpin;3289}32903291/* Avoid banging the cache line if not needed */3292mmap_miss = READ_ONCE(ra->mmap_miss);3293if (mmap_miss < MMAP_LOTSAMISS * 10)3294WRITE_ONCE(ra->mmap_miss, ++mmap_miss);32953296/*3297* Do we miss much more than hit in this file? If so,3298* stop bothering with read-ahead. It will only hurt.3299*/3300if (mmap_miss > MMAP_LOTSAMISS)3301return fpin;33023303if (vm_flags & VM_EXEC) {3304/*3305* Allow arch to request a preferred minimum folio order for3306* executable memory. This can often be beneficial to3307* performance if (e.g.) 
arm64 can contpte-map the folio.3308* Executable memory rarely benefits from readahead, due to its3309* random access nature, so set async_size to 0.3310*3311* Limit to the boundaries of the VMA to avoid reading in any3312* pad that might exist between sections, which would be a waste3313* of memory.3314*/3315struct vm_area_struct *vma = vmf->vma;3316unsigned long start = vma->vm_pgoff;3317unsigned long end = start + vma_pages(vma);3318unsigned long ra_end;33193320ra->order = exec_folio_order();3321ra->start = round_down(vmf->pgoff, 1UL << ra->order);3322ra->start = max(ra->start, start);3323ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);3324ra_end = min(ra_end, end);3325ra->size = ra_end - ra->start;3326ra->async_size = 0;3327} else {3328/*3329* mmap read-around3330*/3331ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);3332ra->size = ra->ra_pages;3333ra->async_size = ra->ra_pages / 4;3334ra->order = 0;3335}33363337fpin = maybe_unlock_mmap_for_io(vmf, fpin);3338ractl._index = ra->start;3339page_cache_ra_order(&ractl, ra);3340return fpin;3341}33423343/*3344* Asynchronous readahead happens when we find the page and PG_readahead,3345* so we want to possibly extend the readahead further. We return the file that3346* was pinned if we have to drop the mmap_lock in order to do IO.3347*/3348static struct file *do_async_mmap_readahead(struct vm_fault *vmf,3349struct folio *folio)3350{3351struct file *file = vmf->vma->vm_file;3352struct file_ra_state *ra = &file->f_ra;3353DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);3354struct file *fpin = NULL;3355unsigned short mmap_miss;33563357/* If we don't want any read-ahead, don't bother */3358if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)3359return fpin;33603361/*3362* If the folio is locked, we're likely racing against another fault.3363* Don't touch the mmap_miss counter to avoid decreasing it multiple3364* times for a single folio and break the balance with mmap_miss3365* increase in do_sync_mmap_readahead().3366*/3367if (likely(!folio_test_locked(folio))) {3368mmap_miss = READ_ONCE(ra->mmap_miss);3369if (mmap_miss)3370WRITE_ONCE(ra->mmap_miss, --mmap_miss);3371}33723373if (folio_test_readahead(folio)) {3374fpin = maybe_unlock_mmap_for_io(vmf, fpin);3375page_cache_async_ra(&ractl, folio, ra->ra_pages);3376}3377return fpin;3378}33793380static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)3381{3382struct vm_area_struct *vma = vmf->vma;3383vm_fault_t ret = 0;3384pte_t *ptep;33853386/*3387* We might have COW'ed a pagecache folio and might now have an mlocked3388* anon folio mapped. The original pagecache folio is not mlocked and3389* might have been evicted. During a read+clear/modify/write update of3390* the PTE, such as done in do_numa_page()/change_pte_range(), we3391* temporarily clear the PTE under PT lock and might detect it here as3392* "none" when not holding the PT lock.3393*3394* Not rechecking the PTE under PT lock could result in an unexpected3395* major fault in an mlock'ed region. Recheck only for this special3396* scenario while holding the PT lock, to not degrade non-mlocked3397* scenarios. 
Recheck the PTE without PT lock firstly, thereby reducing3398* the number of times we hold PT lock.3399*/3400if (!(vma->vm_flags & VM_LOCKED))3401return 0;34023403if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))3404return 0;34053406ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,3407&vmf->ptl);3408if (unlikely(!ptep))3409return VM_FAULT_NOPAGE;34103411if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {3412ret = VM_FAULT_NOPAGE;3413} else {3414spin_lock(vmf->ptl);3415if (unlikely(!pte_none(ptep_get(ptep))))3416ret = VM_FAULT_NOPAGE;3417spin_unlock(vmf->ptl);3418}3419pte_unmap(ptep);3420return ret;3421}34223423/**3424* filemap_fault - read in file data for page fault handling3425* @vmf: struct vm_fault containing details of the fault3426*3427* filemap_fault() is invoked via the vma operations vector for a3428* mapped memory region to read in file data during a page fault.3429*3430* The goto's are kind of ugly, but this streamlines the normal case of having3431* it in the page cache, and handles the special cases reasonably without3432* having a lot of duplicated code.3433*3434* vma->vm_mm->mmap_lock must be held on entry.3435*3436* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock3437* may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().3438*3439* If our return value does not have VM_FAULT_RETRY set, the mmap_lock3440* has not been released.3441*3442* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.3443*3444* Return: bitwise-OR of %VM_FAULT_ codes.3445*/3446vm_fault_t filemap_fault(struct vm_fault *vmf)3447{3448int error;3449struct file *file = vmf->vma->vm_file;3450struct file *fpin = NULL;3451struct address_space *mapping = file->f_mapping;3452struct inode *inode = mapping->host;3453pgoff_t max_idx, index = vmf->pgoff;3454struct folio *folio;3455vm_fault_t ret = 0;3456bool mapping_locked = false;34573458max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);3459if (unlikely(index >= max_idx))3460return VM_FAULT_SIGBUS;34613462trace_mm_filemap_fault(mapping, index);34633464/*3465* Do we have something in the page cache already?3466*/3467folio = filemap_get_folio(mapping, index);3468if (likely(!IS_ERR(folio))) {3469/*3470* We found the page, so try async readahead before waiting for3471* the lock.3472*/3473if (!(vmf->flags & FAULT_FLAG_TRIED))3474fpin = do_async_mmap_readahead(vmf, folio);3475if (unlikely(!folio_test_uptodate(folio))) {3476filemap_invalidate_lock_shared(mapping);3477mapping_locked = true;3478}3479} else {3480ret = filemap_fault_recheck_pte_none(vmf);3481if (unlikely(ret))3482return ret;34833484/* No page in the page cache at all */3485count_vm_event(PGMAJFAULT);3486count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);3487ret = VM_FAULT_MAJOR;3488fpin = do_sync_mmap_readahead(vmf);3489retry_find:3490/*3491* See comment in filemap_create_folio() why we need3492* invalidate_lock3493*/3494if (!mapping_locked) {3495filemap_invalidate_lock_shared(mapping);3496mapping_locked = true;3497}3498folio = __filemap_get_folio(mapping, index,3499FGP_CREAT|FGP_FOR_MMAP,3500vmf->gfp_mask);3501if (IS_ERR(folio)) {3502if (fpin)3503goto out_retry;3504filemap_invalidate_unlock_shared(mapping);3505return VM_FAULT_OOM;3506}3507}35083509if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))3510goto out_retry;35113512/* Did it get truncated? 
*/3513if (unlikely(folio->mapping != mapping)) {3514folio_unlock(folio);3515folio_put(folio);3516goto retry_find;3517}3518VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);35193520/*3521* We have a locked folio in the page cache, now we need to check3522* that it's up-to-date. If not, it is going to be due to an error,3523* or because readahead was otherwise unable to retrieve it.3524*/3525if (unlikely(!folio_test_uptodate(folio))) {3526/*3527* If the invalidate lock is not held, the folio was in cache3528* and uptodate and now it is not. Strange but possible since we3529* didn't hold the page lock all the time. Let's drop3530* everything, get the invalidate lock and try again.3531*/3532if (!mapping_locked) {3533folio_unlock(folio);3534folio_put(folio);3535goto retry_find;3536}35373538/*3539* OK, the folio is really not uptodate. This can be because the3540* VMA has the VM_RAND_READ flag set, or because an error3541* arose. Let's read it in directly.3542*/3543goto page_not_uptodate;3544}35453546/*3547* We've made it this far and we had to drop our mmap_lock, now is the3548* time to return to the upper layer and have it re-find the vma and3549* redo the fault.3550*/3551if (fpin) {3552folio_unlock(folio);3553goto out_retry;3554}3555if (mapping_locked)3556filemap_invalidate_unlock_shared(mapping);35573558/*3559* Found the page and have a reference on it.3560* We must recheck i_size under page lock.3561*/3562max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);3563if (unlikely(index >= max_idx)) {3564folio_unlock(folio);3565folio_put(folio);3566return VM_FAULT_SIGBUS;3567}35683569vmf->page = folio_file_page(folio, index);3570return ret | VM_FAULT_LOCKED;35713572page_not_uptodate:3573/*3574* Umm, take care of errors if the page isn't up-to-date.3575* Try to re-read it _once_. We do this synchronously,3576* because there really aren't any performance issues here3577* and we need to check for errors.3578*/3579fpin = maybe_unlock_mmap_for_io(vmf, fpin);3580error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);3581if (fpin)3582goto out_retry;3583folio_put(folio);35843585if (!error || error == AOP_TRUNCATED_PAGE)3586goto retry_find;3587filemap_invalidate_unlock_shared(mapping);35883589return VM_FAULT_SIGBUS;35903591out_retry:3592/*3593* We dropped the mmap_lock, we need to return to the fault handler to3594* re-find the vma and come back and find our hopefully still populated3595* page.3596*/3597if (!IS_ERR(folio))3598folio_put(folio);3599if (mapping_locked)3600filemap_invalidate_unlock_shared(mapping);3601if (fpin)3602fput(fpin);3603return ret | VM_FAULT_RETRY;3604}3605EXPORT_SYMBOL(filemap_fault);36063607static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,3608pgoff_t start)3609{3610struct mm_struct *mm = vmf->vma->vm_mm;36113612/* Huge page is mapped? No need to proceed. */3613if (pmd_trans_huge(*vmf->pmd)) {3614folio_unlock(folio);3615folio_put(folio);3616return true;3617}36183619if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {3620struct page *page = folio_file_page(folio, start);3621vm_fault_t ret = do_set_pmd(vmf, folio, page);3622if (!ret) {3623/* The page is mapped successfully, reference consumed. 
*/3624folio_unlock(folio);3625return true;3626}3627}36283629if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)3630pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);36313632return false;3633}36343635static struct folio *next_uptodate_folio(struct xa_state *xas,3636struct address_space *mapping, pgoff_t end_pgoff)3637{3638struct folio *folio = xas_next_entry(xas, end_pgoff);3639unsigned long max_idx;36403641do {3642if (!folio)3643return NULL;3644if (xas_retry(xas, folio))3645continue;3646if (xa_is_value(folio))3647continue;3648if (!folio_try_get(folio))3649continue;3650if (folio_test_locked(folio))3651goto skip;3652/* Has the page moved or been split? */3653if (unlikely(folio != xas_reload(xas)))3654goto skip;3655if (!folio_test_uptodate(folio) || folio_test_readahead(folio))3656goto skip;3657if (!folio_trylock(folio))3658goto skip;3659if (folio->mapping != mapping)3660goto unlock;3661if (!folio_test_uptodate(folio))3662goto unlock;3663max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);3664if (xas->xa_index >= max_idx)3665goto unlock;3666return folio;3667unlock:3668folio_unlock(folio);3669skip:3670folio_put(folio);3671} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);36723673return NULL;3674}36753676/*3677* Map page range [start_page, start_page + nr_pages) of folio.3678* start_page is gotten from start by folio_page(folio, start)3679*/3680static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,3681struct folio *folio, unsigned long start,3682unsigned long addr, unsigned int nr_pages,3683unsigned long *rss, unsigned short *mmap_miss)3684{3685unsigned int ref_from_caller = 1;3686vm_fault_t ret = 0;3687struct page *page = folio_page(folio, start);3688unsigned int count = 0;3689pte_t *old_ptep = vmf->pte;3690unsigned long addr0;36913692/*3693* Map the large folio fully where possible.3694*3695* The folio must not cross VMA or page table boundary.3696*/3697addr0 = addr - start * PAGE_SIZE;3698if (folio_within_vma(folio, vmf->vma) &&3699(addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {3700vmf->pte -= start;3701page -= start;3702addr = addr0;3703nr_pages = folio_nr_pages(folio);3704}37053706do {3707if (PageHWPoison(page + count))3708goto skip;37093710/*3711* If there are too many folios that are recently evicted3712* in a file, they will probably continue to be evicted.3713* In such situation, read-ahead is only a waste of IO.3714* Don't decrease mmap_miss in this scenario to make sure3715* we can stop read-ahead.3716*/3717if (!folio_test_workingset(folio))3718(*mmap_miss)++;37193720/*3721* NOTE: If there're PTE markers, we'll leave them to be3722* handled in the specific fault path, and it'll prohibit the3723* fault-around logic.3724*/3725if (!pte_none(ptep_get(&vmf->pte[count])))3726goto skip;37273728count++;3729continue;3730skip:3731if (count) {3732set_pte_range(vmf, folio, page, count, addr);3733*rss += count;3734folio_ref_add(folio, count - ref_from_caller);3735ref_from_caller = 0;3736if (in_range(vmf->address, addr, count * PAGE_SIZE))3737ret = VM_FAULT_NOPAGE;3738}37393740count++;3741page += count;3742vmf->pte += count;3743addr += count * PAGE_SIZE;3744count = 0;3745} while (--nr_pages > 0);37463747if (count) {3748set_pte_range(vmf, folio, page, count, addr);3749*rss += count;3750folio_ref_add(folio, count - ref_from_caller);3751ref_from_caller = 0;3752if (in_range(vmf->address, addr, count * PAGE_SIZE))3753ret = VM_FAULT_NOPAGE;3754}37553756vmf->pte = old_ptep;3757if (ref_from_caller)3758/* Locked folios cannot get truncated. 
*/3759folio_ref_dec(folio);37603761return ret;3762}37633764static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,3765struct folio *folio, unsigned long addr,3766unsigned long *rss, unsigned short *mmap_miss)3767{3768vm_fault_t ret = 0;3769struct page *page = &folio->page;37703771if (PageHWPoison(page))3772goto out;37733774/* See comment of filemap_map_folio_range() */3775if (!folio_test_workingset(folio))3776(*mmap_miss)++;37773778/*3779* NOTE: If there're PTE markers, we'll leave them to be3780* handled in the specific fault path, and it'll prohibit3781* the fault-around logic.3782*/3783if (!pte_none(ptep_get(vmf->pte)))3784goto out;37853786if (vmf->address == addr)3787ret = VM_FAULT_NOPAGE;37883789set_pte_range(vmf, folio, page, 1, addr);3790(*rss)++;3791return ret;37923793out:3794/* Locked folios cannot get truncated. */3795folio_ref_dec(folio);3796return ret;3797}37983799vm_fault_t filemap_map_pages(struct vm_fault *vmf,3800pgoff_t start_pgoff, pgoff_t end_pgoff)3801{3802struct vm_area_struct *vma = vmf->vma;3803struct file *file = vma->vm_file;3804struct address_space *mapping = file->f_mapping;3805pgoff_t file_end, last_pgoff = start_pgoff;3806unsigned long addr;3807XA_STATE(xas, &mapping->i_pages, start_pgoff);3808struct folio *folio;3809vm_fault_t ret = 0;3810unsigned long rss = 0;3811unsigned int nr_pages = 0, folio_type;3812unsigned short mmap_miss = 0, mmap_miss_saved;38133814rcu_read_lock();3815folio = next_uptodate_folio(&xas, mapping, end_pgoff);3816if (!folio)3817goto out;38183819if (filemap_map_pmd(vmf, folio, start_pgoff)) {3820ret = VM_FAULT_NOPAGE;3821goto out;3822}38233824addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);3825vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);3826if (!vmf->pte) {3827folio_unlock(folio);3828folio_put(folio);3829goto out;3830}38313832file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;3833if (end_pgoff > file_end)3834end_pgoff = file_end;38353836folio_type = mm_counter_file(folio);3837do {3838unsigned long end;38393840addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;3841vmf->pte += xas.xa_index - last_pgoff;3842last_pgoff = xas.xa_index;3843end = folio_next_index(folio) - 1;3844nr_pages = min(end, end_pgoff) - xas.xa_index + 1;38453846if (!folio_test_large(folio))3847ret |= filemap_map_order0_folio(vmf,3848folio, addr, &rss, &mmap_miss);3849else3850ret |= filemap_map_folio_range(vmf, folio,3851xas.xa_index - folio->index, addr,3852nr_pages, &rss, &mmap_miss);38533854folio_unlock(folio);3855} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);3856add_mm_counter(vma->vm_mm, folio_type, rss);3857pte_unmap_unlock(vmf->pte, vmf->ptl);3858trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);3859out:3860rcu_read_unlock();38613862mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);3863if (mmap_miss >= mmap_miss_saved)3864WRITE_ONCE(file->f_ra.mmap_miss, 0);3865else3866WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);38673868return ret;3869}3870EXPORT_SYMBOL(filemap_map_pages);38713872vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)3873{3874struct address_space *mapping = vmf->vma->vm_file->f_mapping;3875struct folio *folio = page_folio(vmf->page);3876vm_fault_t ret = VM_FAULT_LOCKED;38773878sb_start_pagefault(mapping->host->i_sb);3879file_update_time(vmf->vma->vm_file);3880folio_lock(folio);3881if (folio->mapping != mapping) {3882folio_unlock(folio);3883ret = VM_FAULT_NOPAGE;3884goto out;3885}3886/*3887* We mark the folio dirty already here 

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct folio *folio = page_folio(vmf->page);
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(mapping->host->i_sb);
	file_update_time(vmf->vma->vm_file);
	folio_lock(folio);
	if (folio->mapping != mapping) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the folio dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty folio and writeprotect it again.
	 */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);
out:
	sb_end_pagefault(mapping->host->i_sb);
	return ret;
}

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

int generic_file_mmap_prepare(struct vm_area_desc *desc)
{
	struct file *file = desc->file;
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	desc->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma_is_shared_maywrite(vma))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}

int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
{
	if (is_shared_maywrite(desc->vm_flags))
		return -EINVAL;
	return generic_file_mmap_prepare(desc);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_mmap_prepare(struct vm_area_desc *desc)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_mmap_prepare);
EXPORT_SYMBOL(generic_file_readonly_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
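
/*
 * Example (illustrative sketch; "myfs" is hypothetical): a filesystem
 * without ->writepage support would typically hook the read-only variant
 * into its file_operations.  A filesystem converted to the newer
 * ->mmap_prepare() hook would set .mmap_prepare to
 * generic_file_readonly_mmap_prepare() instead of setting .mmap.
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_readonly_mmap,
 *		.splice_read	= filemap_splice_read,
 *	};
 */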

static struct folio *do_read_cache_folio(struct address_space *mapping,
		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;
	int err;

	if (!filler)
		filler = mapping->a_ops->read_folio;
repeat:
	folio = filemap_get_folio(mapping, index);
	if (IS_ERR(folio)) {
		folio = filemap_alloc_folio(gfp,
					    mapping_min_folio_order(mapping));
		if (!folio)
			return ERR_PTR(-ENOMEM);
		index = mapping_align_index(mapping, index);
		err = filemap_add_folio(mapping, folio, index, gfp);
		if (unlikely(err)) {
			folio_put(folio);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

		goto filler;
	}
	if (folio_test_uptodate(folio))
		goto out;

	if (!folio_trylock(folio)) {
		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/* Folio was truncated from mapping */
	if (!folio->mapping) {
		folio_unlock(folio);
		folio_put(folio);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (folio_test_uptodate(folio)) {
		folio_unlock(folio);
		goto out;
	}

filler:
	err = filemap_read_folio(file, filler, folio);
	if (err) {
		folio_put(folio);
		if (err == AOP_TRUNCATED_PAGE)
			goto repeat;
		return ERR_PTR(err);
	}

out:
	folio_mark_accessed(folio);
	return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Read one page into the page cache. If it succeeds, the folio returned
 * will contain @index, but it may not be the first page of the folio.
 *
 * If the filler function returns an error, it will be returned to the
 * caller.
 *
 * Context: May sleep. Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
		filler_t filler, struct file *file)
{
	return do_read_cache_folio(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_folio);

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping: The address_space for the folio.
 * @index: The index that the allocated folio will contain.
 * @gfp: The page allocator flags to use if allocating.
 *
 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
 * any new memory allocations done using the specified allocation flags.
 *
 * The most likely error from this function is EIO, but ENOMEM is
 * possible and so is EINTR. If ->read_folio returns another error,
 * that will be returned to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

static struct page *do_read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;

	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
	if (IS_ERR(folio))
		return &folio->page;
	return folio_file_page(folio, index);
}

struct page *read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file)
{
	return do_read_cache_page(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the page's address_space
 * @index: the page index
 * @gfp: the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
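
/*
 * Example (illustrative sketch): reading and using one folio with the
 * helpers above.  The folio comes back uptodate with an elevated
 * refcount and may start before @index, hence the explicit offset.
 * Locking context is as documented for read_cache_folio(); error
 * handling is abbreviated.
 *
 *	loff_t pos = (loff_t)index << PAGE_SHIFT;
 *	struct folio *folio;
 *	void *kaddr;
 *
 *	folio = read_cache_folio(mapping, index, NULL, file);
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	kaddr = kmap_local_folio(folio, offset_in_folio(folio, pos));
 *	... read at most PAGE_SIZE bytes from kaddr ...
 *	kunmap_local(kaddr);
 *	folio_put(folio);
 */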

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	if (mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping,
			iocb->ki_pos >> PAGE_SHIFT,
			(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
		dio_warn_stale_pagecache(iocb->ki_filp);
}
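
/*
 * Example (illustrative sketch): a direct-I/O write path that does not go
 * through the common dio completion code can pair kiocb_invalidate_pages()
 * before issuing the I/O with kiocb_invalidate_post_direct_write() once it
 * knows how many bytes were written, exactly as generic_file_direct_write()
 * below does:
 *
 *	if (written > 0)
 *		kiocb_invalidate_post_direct_write(iocb, written);
 */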

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t write_len = iov_iter_count(from);
	ssize_t written;

	/*
	 * If a page cannot be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	written = kiocb_invalidate_pages(iocb, write_len);
	if (written) {
		if (written == -EBUSY)
			return 0;
		return written;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing. Either one is a pretty crazy thing to do,
	 * so we don't support it 100%. If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However, there are some filesystems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * A notable example is blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0) {
		struct inode *inode = mapping->host;
		loff_t pos = iocb->ki_pos;

		kiocb_invalidate_post_direct_write(iocb, written);
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	size_t chunk = mapping_max_folio_size(mapping);
	long status = 0;
	ssize_t written = 0;

	do {
		struct folio *folio;
		size_t offset;		/* Offset into folio */
		size_t bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		bytes = iov_iter_count(i);
retry:
		offset = pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(iocb, mapping, pos, bytes,
						&folio, &fsdata);
		if (unlikely(status < 0))
			break;

		offset = offset_in_folio(folio, pos);
		if (bytes > folio_size(folio) - offset)
			bytes = folio_size(folio) - offset;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		/*
		 * Faults here on mmap()s can recurse into arbitrary
		 * filesystem code. Lots of locks are held that can
		 * deadlock. Use an atomic copy to avoid deadlocking
		 * in page fault handling.
		 */
		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		flush_dcache_folio(folio);

		status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
						folio, fsdata);
		if (unlikely(status != copied)) {
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely. Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				bytes = copied;
				goto retry;
			}

			/*
			 * 'folio' is now unlocked and faults on it can be
			 * handled. Ensure forward progress by trying to
			 * fault it in now.
			 */
			if (fault_in_iov_iter_readable(i, bytes) == bytes) {
				status = -EFAULT;
				break;
			}
		} else {
			pos += status;
			written += status;
		}
	} while (iov_iter_count(i));

	if (!written)
		return status;
	iocb->ki_pos += written;
	return written;
}
EXPORT_SYMBOL(generic_perform_write);
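
/*
 * Example (illustrative sketch; "myfs_file_write_iter" is hypothetical and
 * error handling is abbreviated): a buffered-only filesystem could drive
 * the copy loop above from its ->write_iter(), which is essentially
 * __generic_file_write_iter() below minus the direct-I/O branch:
 *
 *	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock(inode);
 *		ret = generic_write_checks(iocb, from);
 *		if (ret > 0)
 *			ret = generic_perform_write(iocb, from);
 *		inode_unlock(inode);
 *		if (ret > 0)
 *			ret = generic_write_sync(iocb, ret);
 *		return ret;
 *	}
 */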

/**
 * __generic_file_write_iter - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes. Some filesystems do this for writes to
		 * holes, for example. For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
			return ret;
		return direct_write_fallback(iocb, from, ret,
				generic_perform_write(iocb, from));
	}

	return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC writes
 * and acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);
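
/*
 * Example (illustrative sketch; "myfs" is hypothetical): a filesystem that
 * relies entirely on the generic read/write/mmap paths in this file can
 * simply wire them up in its file_operations:
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *		.fsync		= generic_file_fsync,
 *		.splice_read	= filemap_splice_read,
 *		.splice_write	= iter_file_splice_write,
 *	};
 */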

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This will also be called if the private_2 flag is set on a page,
 * indicating that the folio has other metadata associated with it.
 *
 * The @gfp argument specifies whether I/O may be performed to release
 * this page (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
	struct address_space * const mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	if (!folio_needs_release(folio))
		return true;
	if (folio_test_writeback(folio))
		return false;

	if (mapping && mapping->a_ops->release_folio)
		return mapping->a_ops->release_folio(folio, gfp);
	return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);

/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte in range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first. Whilst the operation is
 * undertaken, the invalidate lock is held to prevent new folios from being
 * installed.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
			     loff_t start, loff_t end)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t first = start >> PAGE_SHIFT;
	pgoff_t last = end >> PAGE_SHIFT;
	pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;

	if (!mapping || !mapping->nrpages || end < start)
		goto out;

	/* Prevent new folios from being added to the inode. */
	filemap_invalidate_lock(mapping);

	if (!mapping->nrpages)
		goto unlock;

	unmap_mapping_pages(mapping, first, nr, false);

	/* Write back the data if we're asked to. */
	if (flush) {
		struct writeback_control wbc = {
			.sync_mode	= WB_SYNC_ALL,
			.nr_to_write	= LONG_MAX,
			.range_start	= start,
			.range_end	= end,
		};

		filemap_fdatawrite_wbc(mapping, &wbc);
	}

	/* Wait for writeback to complete on all folios and discard. */
	invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);

unlock:
	filemap_invalidate_unlock(mapping);
out:
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
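
/*
 * Example (illustrative sketch; the surrounding use case is hypothetical):
 * write back and drop the pagecache covering the first megabyte of an
 * inode before handing that range over to another agent.  Passing
 * start == 0 and end == LLONG_MAX covers the whole file instead.
 *
 *	err = filemap_invalidate_inode(inode, true, 0, SZ_1M - 1);
 *	if (err)
 *		... a previously recorded writeback error was reported ...
 */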

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping: The mapping to compute the statistics for.
 * @first_index: The starting page cache index.
 * @last_index: The final page index (inclusive).
 * @cs: the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 */
static void filemap_cachestat(struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
	XA_STATE(xas, &mapping->i_pages, first_index);
	struct folio *folio;

	/* Flush stats (and potentially sleep) outside the RCU read section. */
	mem_cgroup_flush_stats_ratelimited(NULL);

	rcu_read_lock();
	xas_for_each(&xas, folio, last_index) {
		int order;
		unsigned long nr_pages;
		pgoff_t folio_first_index, folio_last_index;

		/*
		 * Don't deref the folio. It is not pinned, and might
		 * get freed (and reused) underneath us.
		 *
		 * We *could* pin it, but that would be expensive for
		 * what should be a fast and lightweight syscall.
		 *
		 * Instead, derive all information of interest from
		 * the rcu-protected xarray.
		 */

		if (xas_retry(&xas, folio))
			continue;

		order = xas_get_order(&xas);
		nr_pages = 1 << order;
		folio_first_index = round_down(xas.xa_index, 1 << order);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				/* swapin error results in poisoned entry */
				if (non_swap_entry(swp))
					goto resched;

				/*
				 * Getting a swap entry from the shmem
				 * inode means we beat
				 * shmem_unuse(). rcu_read_lock()
				 * ensures swapoff waits for us before
				 * freeing the swapper space. However,
				 * we can race with swapping and
				 * invalidation, so there might not be
				 * a shadow in the swapcache (yet).
				 */
				shadow = swap_cache_get_shadow(swp);
				if (!shadow)
					goto resched;
			}
#endif
			if (workingset_test_recent(shadow, true, &workingset, false))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		/* page is in cache */
		cs->nr_cache += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
			cs->nr_dirty += nr_pages;

		if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
			cs->nr_writeback += nr_pages;

resched:
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * See mincore: reveal pagecache information only for files
 * that the calling process has write access to, or could (if
 * tried) open for writing.
 */
static inline bool can_do_cachestat(struct file *f)
{
	if (f->f_mode & FMODE_WRITE)
		return true;
	if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
		return true;
	return file_permission(f, MAY_WRITE) == 0;
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * byte range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len` - 1]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. Userspace should pass 0 (i.e. no flags specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
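
/*
 * Example (illustrative sketch, userspace): querying the whole-file
 * statistics through the raw syscall, assuming a libc without a
 * cachestat() wrapper; error handling is abbreviated.
 *
 *	#include <linux/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	struct cachestat_range range = { .off = 0, .len = 0 };
 *	struct cachestat cs;
 *
 *	if (syscall(__NR_cachestat, fd, &range, &cs, 0) == 0)
 *		printf("cached=%llu dirty=%llu writeback=%llu\n",
 *		       cs.nr_cache, cs.nr_dirty, cs.nr_writeback);
 */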

SYSCALL_DEFINE4(cachestat, unsigned int, fd,
		struct cachestat_range __user *, cstat_range,
		struct cachestat __user *, cstat, unsigned int, flags)
{
	CLASS(fd, f)(fd);
	struct address_space *mapping;
	struct cachestat_range csr;
	struct cachestat cs;
	pgoff_t first_index, last_index;

	if (fd_empty(f))
		return -EBADF;

	if (copy_from_user(&csr, cstat_range,
			sizeof(struct cachestat_range)))
		return -EFAULT;

	/* hugetlbfs is not supported */
	if (is_file_hugepages(fd_file(f)))
		return -EOPNOTSUPP;

	if (!can_do_cachestat(fd_file(f)))
		return -EPERM;

	if (flags != 0)
		return -EINVAL;

	first_index = csr.off >> PAGE_SHIFT;
	last_index =
		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
	memset(&cs, 0, sizeof(struct cachestat));
	mapping = fd_file(f)->f_mapping;
	filemap_cachestat(mapping, first_index, last_index, &cs);

	if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
		return -EFAULT;

	return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */