文件系统的读写,就是调用系统函数 read 和 write,读和写的很多逻辑是相似的。
read 和 write 系统调用在内核里面的定义
/*
 * read(2) entry point (excerpt; "......" marks code elided by the article).
 * Looks up the struct fd for the descriptor with fdget_pos(), fetches the
 * file's current position with file_pos_read() and hands both to vfs_read().
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
......
	loff_t pos = file_pos_read(f.file);
	ret = vfs_read(f.file, buf, count, &pos);
......
}

/*
 * write(2) entry point (excerpt).  Mirrors the read path shown above in this
 * block: same fdget_pos()/file_pos_read() sequence, but the user buffer is
 * const and the work is delegated to vfs_write().
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd);
......
	loff_t pos = file_pos_read(f.file);
	ret = vfs_write(f.file, buf, count, &pos);
......
}
read 调用vfs_read->__vfs_read
write 调用 vfs_write->__vfs_write
__vfs_read 和 __vfs_write 函数
/*
 * __vfs_read - dispatch a read to the file's file_operations.
 * @file:  open file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   file position, passed through to the handler
 *
 * Prefers the ->read handler when the file provides one; otherwise falls
 * back to new_sync_read() when ->read_iter is provided.  Returns -EINVAL
 * when the file implements neither.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	if (file->f_op->read)
		return file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		return new_sync_read(file, buf, count, pos);
	else
		return -EINVAL;
}

/*
 * __vfs_write - dispatch a write to the file's file_operations.
 * @file:  open file to write to
 * @p:     user-space source buffer
 * @count: number of bytes to write
 * @pos:   file position, passed through to the handler
 *
 * Prefers the ->write handler when the file provides one; otherwise falls
 * back to new_sync_write() when ->write_iter is provided.  Returns -EINVAL
 * when the file implements neither.
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
		    loff_t *pos)
{
	if (file->f_op->write)
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter)
		return new_sync_write(file, p, count, pos);
	else
		return -EINVAL;
}
\linux-4.13.16\fs\read_write.c
每一个打开的文件,都有一个 struct file 结构。这里面有一个 struct file_operations f_op,用于定义对这个文件做的操作。__vfs_read 会调用相应文件系统的 file_operations 里面的 read 操作,__vfs_write 会调用相应文件系统 file_operations 里的 write 操作。
\linux-4.13.16\fs\ext4\file.c
/*
 * ext4's file_operations table (excerpt; "......" marks entries elided by
 * the article).  The entries shown wire ->read_iter and ->write_iter to the
 * ext4 implementations.
 */
const struct file_operations ext4_file_operations = {
......
	.read_iter = ext4_file_read_iter,
	.write_iter = ext4_file_write_iter,
......
}
read 和 write 函数会调用 ext4_file_read_iter 和 ext4_file_write_iter。
ext4_file_read_iter 会调用 generic_file_read_iter,ext4_file_write_iter 会调用 __generic_file_write_iter。
\linux-4.13.16\mm\filemap.c
/*
 * generic_file_read_iter (excerpt; "......" marks code elided by the
 * article).  When IOCB_DIRECT is set in iocb->ki_flags the read goes
 * through the mapping's ->direct_IO operation; the excerpt then shows the
 * call to generic_file_buffered_read(), which receives the retval produced
 * so far.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
......
	if (iocb->ki_flags & IOCB_DIRECT) {
......
		struct address_space *mapping = file->f_mapping;
......
		retval = mapping->a_ops->direct_IO(iocb, iter);
	}
......
	retval = generic_file_buffered_read(iocb, iter, retval);
}

/*
 * __generic_file_write_iter (excerpt).  Chooses between the two write
 * paths: generic_file_direct_write() when IOCB_DIRECT is set, otherwise
 * generic_perform_write() starting at iocb->ki_pos.
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
......
	if (iocb->ki_flags & IOCB_DIRECT) {
......
		written = generic_file_direct_write(iocb, from);
......
	} else {
......
		written = generic_perform_write(file, from, iocb->ki_pos);
......
	}
}
generic_file_read_iter 和 __generic_file_write_iter 有相似的逻辑,就是要区分是否用缓存。缓存其实就是内存中的一块空间。
根据是否使用内存做缓存,可以把文件的 I/O 操作分为两种类型
第一种类型是缓存 I/O
多数文件系统的默认模式
第二种类型是直接 IO
应用程序直接访问磁盘数据
对于 ext4 文件系统来讲, address_space 的操作定义在 ext4_aops,direct_IO 对应的函数是 ext4_direct_IO。
ext4_direct_IO 最终会调用到 __blockdev_direct_IO->do_blockdev_direct_IO,跨过了缓存层,到了通用块层,最终到了文件系统的设备驱动层。
带缓存的写入函数
\linux-4.13.16\mm\filemap.c
/*
 * generic_perform_write (abridged by the article: declarations of status,
 * copied, written, flags and fsdata, and the return statement, are not
 * shown).
 *
 * Loops until the iov_iter is drained; each iteration handles one page.
 */
ssize_t generic_perform_write(struct file *file,
			      struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;

	do {
		struct page *page;
		unsigned long offset; /* Offset into pagecache page */
		unsigned long bytes; /* Bytes to write to page */

		/* Step 1: let the filesystem prepare the target page. */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
					    &page, &fsdata);

		/* Step 2: copy the caller's data into that page. */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

		/* Step 3: let the filesystem finish the write for this page. */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
					  page, fsdata);

		/* Advance by what was actually copied. */
		pos += copied;
		written += copied;

		/* Step 4: give the writeback code a chance to throttle/flush. */
		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));
}
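在 while 循环中,找出写入影响的页,并依次写入。对每一页完成以下四步:(1) 调用 write_begin(ext4 中为 ext4_write_begin)做写入前的准备;(2) 调用 iov_iter_copy_from_user_atomic,将要写入的内容从用户态拷贝到内核态的页中;(3) 调用 write_end 完成这一页的写入;(4) 调用 balance_dirty_pages_ratelimited,检查脏页是否过多、是否需要回写。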
ext4 是一种日志文件系统,为了防止突然断电时数据丢失,引入了日志(Journal)模式。
文件分为元数据和数据两部分,二者的操作日志也是分开维护的。
ext4_write_begin 中 ext4_journal_start 是在做日志相关的工作
在 ext4_write_begin 中,还做了另外一件重要的事情,就是调用 grab_cache_page_write_begin,来得到应该写入的缓存页。
内核中缓存以页为单位。打开文件的 struct file 结构中的 struct address_space 用于关联文件和内存,这个结构里面有一棵基数树(radix tree),保存所有与这个文件相关的缓存页。
/*
 * struct address_space (excerpt; "......" marks fields elided by the
 * article).  Only the owner and the page-tracking fields are shown.
 */
struct address_space {
	struct inode *host; /* owner: inode, block_device */
	struct radix_tree_root page_tree; /* radix tree of all pages */
	spinlock_t tree_lock; /* and lock protecting it */
......
}
\linux-4.13.16\lib\iov_iter.c
/*
 * iov_iter_copy_from_user_atomic - copy data described by an iov_iter into
 * a page (as quoted by the article).
 * @page:   destination page
 * @i:      iterator describing the source data
 * @offset: byte offset into @page at which to start writing
 * @bytes:  number of bytes to copy
 *
 * Maps @page with kmap_atomic(), copies into kaddr + offset, then unmaps
 * with kunmap_atomic().  As written here it returns @bytes.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;

	/*
	 * One copy expression per iterator flavour; each advances p by the
	 * segment length and passes the pre-advance address as destination.
	 */
	iterate_all_kinds(i, bytes, v,
		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
将分配好的页面调用 kmap_atomic 映射到内核里面的一个虚拟地址;将用户态的数据拷贝到内核态的页面的虚拟地址中;调用 kunmap_atomic 解映射
调用 ext4_journal_stop 完成日志的写入
将修改过的缓存标记为脏页,调用链为 block_write_end->__block_commit_write->mark_buffer_dirty
并没有真正写入硬盘,仅仅是写入缓存后,标记为脏页。
将写入的页面真正写到硬盘中,称为回写(Write Back)。
balance_dirty_pages_ratelimited 函数
\linux-4.13.16\mm\page-writeback.c
/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * (Excerpt: "......" marks code elided by the article, including the code
 * that assigns ratelimit and wb.)
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;
	int ratelimit;
......
	/* Only do the real balancing once this task has dirtied enough pages. */
	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(mapping, wb, current->nr_dirtied);
......
}
若发现脏页超额,调用 balance_dirty_pages->wb_start_background_writeback,唤醒后台回写线程执行回写。
\linux-4.13.16\fs\fs-writeback.c
/*
 * wb_start_background_writeback - kick background writeback for @wb.
 * Does nothing itself beyond delegating to wb_wakeup().
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
	/*
	 * We just wake up the flusher thread. It will perform background
	 * writeback as soon as there is no other work to do.
	 */
	wb_wakeup(wb);
}

/*
 * wb_wakeup - queue @wb's delayed work on bdi_wq with a delay of 0.
 * The work is only (re)queued while WB_registered is set in wb->state;
 * the check and the queueing are done under wb->work_lock.
 */
static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}
/* NOTE(review): the next line is a leftover tail of a macro whose start is not in this excerpt. */
(_tflags) | TIMER_IRQSAFE); \} while (0)/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;/*** mod_delayed_work - modify delay of or queue a delayed work* @wq: workqueue to use* @dwork: work to queue* @delay: number of jiffies to wait before queueing** mod_delayed_work_on() on local CPU.*/
static inline bool mod_delayed_work(struct workqueue_struct *wq,struct delayed_work *dwork,unsigned long delay)
{....
回写任务 delayed_work 挂在 bdi_wq 队列, 若delay 设为 0, 马上执行回写
bdi = backing device info 描述块设备信息, 初始化块设备时会初始化 timer, 到时会执行写回函数
其他回写场景:(1) 用户主动调用 sync/fsync,将缓存刷到硬盘;(2) 内存十分紧张、无法分配页面时,触发回写以释放缓存页;(3) 脏页存在的时间超过一定阈值时,由定时器触发回写。
带缓存的读操作,对应的是函数 generic_file_buffered_read。
/*
 * generic_file_buffered_read (abridged by the article: declarations of
 * index, last_index, ra, offset, nr and ret, the would_block and
 * no_cached_page labels, and the loop exit are not shown).
 *
 * For each page of the request: look it up in the page cache, trigger
 * readahead when it is missing or marked for readahead, then copy it out
 * to the caller's iterator.
 */
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;

		page = find_get_page(mapping, index);
		if (!page) {
			/* Not cached: a NOWAIT request must not start I/O. */
			if (iocb->ki_flags & IOCB_NOWAIT)
				goto would_block;
			/* Synchronous readahead, then retry the lookup. */
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		/* Page carries the readahead marker: start async readahead. */
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, iter);
	}
}
generic_file_buffered_read 先在 page cache 中查找是否有缓存页:如果没有,就调用 page_cache_sync_readahead 进行预读,再查找一次;找到缓存页后,调用 copy_page_to_iter 将内容拷贝到用户空间。
图片来自极客时间趣谈linux操作系统
内核版本不同部分函数名称可能不一样
趣谈Linux操作系统(极客时间)链接:
http://gk.link/a/10iXZ
欢迎大家来一起交流学习