Linux1.2文件系统ext2
ext2是一个比较经典的文件系统类型,在最新的内核中仍然支持该文件系统。
本文就基于ext2来简单地学习一下文件系统的工作流程。
文件读取的流程
Linux1.2-sys_read
/*
 * read() system call entry point.
 *
 * Looks up the open file for descriptor `fd` in the current task, checks
 * that it may be read and that its file operations provide a read method,
 * validates the destination buffer, then hands the request to that method.
 *
 * Returns whatever the file's read operation returns, or:
 *   -EBADF   fd out of range, no file/inode behind it, or bit 0 of f_mode
 *            is clear (presumably the "opened for reading" bit)
 *   -EINVAL  the file has no read operation
 *   0        count is zero
 *   the error from verify_area() if the buffer check fails
 */
asmlinkage int sys_read(unsigned int fd,char * buf,unsigned int count)
{
	struct file *filp;
	struct inode *ino;
	int rc;

	if (fd >= NR_OPEN)
		return -EBADF;

	filp = current->files->fd[fd];
	if (!filp)
		return -EBADF;

	ino = filp->f_inode;
	if (!ino)
		return -EBADF;

	if ((filp->f_mode & 1) == 0)
		return -EBADF;

	if (!filp->f_op || !filp->f_op->read)
		return -EINVAL;

	if (count == 0)
		return 0;

	/* The read will store into buf, so the area is checked for writing. */
	rc = verify_area(VERIFY_WRITE, buf, count);
	if (rc)
		return rc;

	return filp->f_op->read(ino, filp, buf, count);
}
从读取的逻辑上看,sys_read在完成参数检查之后,直接调用file的文件操作中的read函数;对于ext2文件系统,该函数就是位于fs/ext2/file.c中的ext2_file_read。
/*
 * Read up to `count` bytes from the regular file `inode`, starting at
 * filp->f_pos, into the user-space buffer `buf`.
 *
 * Returns the number of bytes copied (filp->f_pos is advanced by the same
 * amount), 0 when there is nothing left to read at the current position,
 * -EINVAL for a NULL inode or a non-regular file, and -EIO when not a single
 * byte could be read.
 *
 * buflist[] is used as a ring of NBUF buffer heads: bhb is the slot where the
 * next requested block is stored, bhe is the next block to be copied out.
 */
static int ext2_file_read (struct inode * inode, struct file * filp,
char * buf, int count)
{
int read, left, chars;
int block, blocks, offset;
int bhrequest, uptodate;
int clusterblocks;
struct buffer_head ** bhb, ** bhe;
struct buffer_head * bhreq[NBUF];
struct buffer_head * buflist[NBUF];
struct super_block * sb;
unsigned int size;
int err;
if (!inode) {
printk ("ext2_file_read: inode = NULL\n");
return -EINVAL;
}
sb = inode->i_sb; // superblock of the filesystem this inode belongs to
if (!S_ISREG(inode->i_mode)) {
ext2_warning (sb, "ext2_file_read", "mode = %07o",
inode->i_mode);
return -EINVAL;
}
/* Clamp the request to the bytes that remain before end of file. */
offset = filp->f_pos;
size = inode->i_size;
if (offset > size)
left = 0;
else
left = size - offset;
if (left > count)
left = count;
if (left <= 0)
return 0;
read = 0;
/*
 * From here on: block is the index of the first file block to read, offset
 * is the byte offset inside that block, and size is reused to hold the file
 * length rounded up to whole blocks.
 */
block = offset >> EXT2_BLOCK_SIZE_BITS(sb);
offset &= (sb->s_blocksize - 1);
size = (size + sb->s_blocksize - 1) >> EXT2_BLOCK_SIZE_BITS(sb);
blocks = (left + offset + sb->s_blocksize - 1) >> EXT2_BLOCK_SIZE_BITS(sb); // number of blocks the request spans
bhb = bhe = buflist;
/*
 * With read-ahead enabled on this file, fetch at least the device's
 * read-ahead amount (converted to filesystem blocks), but never past the
 * last block of the file.
 */
if (filp->f_reada) {
if (blocks < read_ahead[MAJOR(inode->i_dev)] >> (EXT2_BLOCK_SIZE_BITS(sb) - 9))
blocks = read_ahead[MAJOR(inode->i_dev)] >> (EXT2_BLOCK_SIZE_BITS(sb) - 9);
if (block + blocks > size)
blocks = size - block;
}
/*
* We do this in a two stage process. We first try and request
* as many blocks as we can, then we wait for the first one to
* complete, and then we try and wrap up as many as are actually
* done. This routine is rather generic, in that it can be used
* in a filesystem by substituting the appropriate function in
* for getblk
*
* This routine is optimized to make maximum use of the various
* buffers and caches.
*/
clusterblocks = 0;
do {
bhrequest = 0;
uptodate = 1;
/* Stage 1: collect buffer heads into the ring and note which need I/O. */
while (blocks) {
--blocks;
#if 1
/* NOTE(review): ext2_getcluster() is not shown here; presumably it prepares a
   run of contiguous blocks and returns how many it covers -- confirm. */
if(!clusterblocks) clusterblocks = ext2_getcluster(inode, block);
if(clusterblocks) clusterblocks--;
#endif
*bhb = ext2_getblk (inode, block++, 0, &err); // buffer head for this file block (may be NULL)
if (*bhb && !(*bhb)->b_uptodate) {
/* Not valid in the cache yet: queue it for a device read. */
uptodate = 0;
bhreq[bhrequest++] = *bhb;
}
if (++bhb == &buflist[NBUF])
bhb = buflist;
/*
* If the block we have on hand is uptodate, go ahead
* and complete processing
*/
if (uptodate)
break;
/* The ring is full once the producer catches up with the consumer. */
if (bhb == bhe)
break;
}
/*
* Now request them all
*/
if (bhrequest)
ll_rw_block (READ, bhrequest, bhreq); // submit every pending buffer in one batch
/* Stage 2: copy out every block whose I/O has finished. */
do {
/*
* Finish off all I/O that has actually completed
*/
if (*bhe) {
wait_on_buffer (*bhe); // wait for this buffer's I/O to finish
if (!(*bhe)->b_uptodate) { /* read error? */
brelse(*bhe);
if (++bhe == &buflist[NBUF])
bhe = buflist;
left = 0;
break;
}
}
/* Copy at most up to the end of the current block. */
if (left < sb->s_blocksize - offset)
chars = left;
else
chars = sb->s_blocksize - offset;
filp->f_pos += chars;
left -= chars;
read += chars;
if (*bhe) {
memcpy_tofs (buf, offset + (*bhe)->b_data,
chars);
brelse (*bhe);
buf += chars;
} else {
/* No buffer for this block (presumably a hole in the file): hand the user zeros. */
while (chars-- > 0)
put_fs_byte (0, buf++);
}
/* Only the first block can start mid-block. */
offset = 0;
if (++bhe == &buflist[NBUF])
bhe = buflist;
} while (left > 0 && bhe != bhb && (!*bhe || !(*bhe)->b_lock));
} while (left > 0);
/*
* Release the read-ahead blocks
*/
while (bhe != bhb) {
brelse (*bhe);
if (++bhe == &buflist[NBUF])
bhe = buflist;
}
if (!read)
return -EIO;
/* A successful read turns read-ahead on for subsequent reads of this file. */
filp->f_reada = 1;
if (!IS_RDONLY(inode)) {
inode->i_atime = CURRENT_TIME;
inode->i_dirt = 1;
}
return read; // number of bytes copied to the user buffer
}
从流程上来看,在通过ext2_getblk获取到所需数据块对应的buffer_head之后,就会调用ll_rw_block来进行数据的读取。
ll_rw_block读取数据
/*
 * Hand an array of buffers to a block device for reading or writing.
 *
 *   rw  READ, READA, WRITE or WRITEA
 *   nr  number of entries in bh[]
 *   bh  the buffers; entries may be NULL and are skipped
 *
 * The only restriction is that all buffers must belong to the same device:
 * the device, its block size and its read-only state are all taken from the
 * first non-NULL buffer.
 *
 * If the device does not exist, a buffer has the wrong size, or a write is
 * attempted on a read-only device, every buffer is marked neither dirty nor
 * up to date and nothing is queued.
 */
void ll_rw_block(int rw, int nr, struct buffer_head * bh[])
{
	struct blk_dev_struct *dev = NULL;
	struct request plug;
	unsigned int major;
	int block_size;
	int idx;

	/* Make sure that the first entry contains something reasonable. */
	while (!*bh) {
		bh++;
		nr--;
		if (nr <= 0)
			return;
	}

	major = MAJOR(bh[0]->b_dev);
	if (major < MAX_BLKDEV)
		dev = blk_dev + major;
	if (!dev || !dev->request_fn) {
		printk(
		"ll_rw_block: Trying to read nonexistent block-device %04lX (%ld)\n",
		       (unsigned long) bh[0]->b_dev, bh[0]->b_blocknr);
		goto sorry;
	}

	/* Work out the block size this device expects (BLOCK_SIZE by default). */
	block_size = BLOCK_SIZE;
	if (blksize_size[major]) {
		int configured = blksize_size[major][MINOR(bh[0]->b_dev)];

		if (configured)
			block_size = configured;
	}

	/* Every buffer must use exactly that size. */
	for (idx = 0; idx < nr; idx++) {
		if (!bh[idx])
			continue;
		if (bh[idx]->b_size != block_size) {
			printk(
			"ll_rw_block: only %d-char blocks implemented (%lu)\n",
			       block_size, bh[idx]->b_size);
			goto sorry;
		}
	}

	if ((rw == WRITE || rw == WRITEA) && is_read_only(bh[0]->b_dev)) {
		printk("Can't write to read-only device 0x%X\n",bh[0]->b_dev);
		goto sorry;
	}

	/*
	 * With more than one buffer, plug the device first: if it has no
	 * pending requests a dummy one is inserted, which keeps the driver
	 * from starting until all of our blocks are in the queue.
	 */
	if (nr > 1)
		plug_device(dev, &plug);

	for (idx = 0; idx < nr; idx++) {
		if (!bh[idx])
			continue;
		bh[idx]->b_req = 1;
		make_request(major, rw, bh[idx]);
		if (rw == READ || rw == READA)
			kstat.pgpgin++;
		else
			kstat.pgpgout++;
	}

	/* Everything is queued -- let it rip. */
	unplug_device(dev);
	return;

sorry:
	for (idx = 0; idx < nr; idx++) {
		if (bh[idx])
			bh[idx]->b_dirt = bh[idx]->b_uptodate = 0;
	}
	return;
}
从流程上看,在经过参数的检查、设备信息的检查之后会调用make_request来进行数据的读取。
static void make_request(int major,int rw, struct buffer_head * bh)
{
unsigned int sector, count;
struct request * req;
int rw_ahead, max_req;
/* WRITEA/READA is special case - it is not really needed, so if the */
/* buffer is locked, we just forget about it, else it's a normal read */
rw_ahead = (rw == READA || rw == WRITEA);
if (rw_ahead) {
if (bh->b_lock)
return;
if (rw == READA)
rw = READ;
else
rw = WRITE;
}
if (rw!=READ && rw!=WRITE) {
printk("Bad block dev command, must be R/W/RA/WA\n");
return;
}
count = bh->b_size >> 9;
sector = bh->b_blocknr * count;
if (blk_size[major])
if (blk_size[major][MINOR(bh->b_dev)] < (sector + count)>>1) {
bh->b_dirt = bh->b_uptodate = 0;
bh->b_req = 0;
return;
}
/* Uhhuh.. Nasty dead-lock possible here.. */
if (bh->b_lock)
return;
/* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */
lock_buffer(bh);
if ((rw == WRITE && !bh->b_dirt) || (rw == READ && bh->b_uptodate)) {
unlock_buffer(bh);
return;
}
/* we don't allow the write-requests to fill up the queue completely:
* we want some room for reads: they take precedence. The last third
* of the requests are only for reads.
*/
max_req = (rw == READ) ? NR_REQUEST : ((NR_REQUEST*2)/3); //判断队列是否满了
/* look for a free request. */
cli();
/* The scsi disk drivers and the IDE driver completely remove the request
* from the queue when they start processing an entry. For this reason
* it is safe to continue to add links to the top entry for those devices.
*/
if (( major == IDE0_MAJOR /* same as HD_MAJOR */
|| major == IDE1_MAJOR
|| major == FLOPPY_MAJOR
|| major == SCSI_DISK_MAJOR
|| major == SCSI_CDROM_MAJOR)
&& (req = blk_dev[major].current_request))
{
#ifdef CONFIG_BLK_DEV_HD
if (major == HD_MAJOR || major == FLOPPY_MAJOR)
#else
if (major == FLOPPY_MAJOR)
#endif CONFIG_BLK_DEV_HD
req = req->next;
while (req) {
if (req->dev == bh->b_dev &&
!req->sem &&
req->cmd == rw &&
req->sector + req->nr_sectors == sector &&
req->nr_sectors < 244)
{
req->bhtail->b_reqnext = bh;
req->bhtail = bh;
req->nr_sectors += count;
mark_buffer_clean(bh);
sti();
return;
}
if (req->dev == bh->b_dev &&
!req->sem &&
req->cmd == rw &&
req->sector - count == sector &&
req->nr_sectors < 244)
{
req->nr_sectors += count;
bh->b_reqnext = req->bh;
req->buffer = bh->b_data;
req->current_nr_sectors = count;
req->sector = sector;
mark_buffer_clean(bh);
req->bh = bh;
sti();
return;
}
req = req->next;
}
}
/* find an unused request. */
req = get_request(max_req, bh->b_dev); //找到一个未使用的请求
sti();
/* if no request available: if rw_ahead, forget it; otherwise try again blocking.. */
if (!req) {
if (rw_ahead) {
unlock_buffer(bh);
return;
}
req = __get_request_wait(max_req, bh->b_dev); // 如果没有则等待获取
}
/* fill up the request-info, and add it to the queue */
req->cmd = rw;
req->errors = 0;
req->sector = sector;
req->nr_sectors = count;
req->current_nr_sectors = count;
req->buffer = bh->b_data;
req->sem = NULL;
req->bh = bh;
req->bhtail = bh;
req->next = NULL;
add_request(major+blk_dev,req); //将请求加入到设备的队列中
}
主要是在经过检查,获取请求之后将请求加入队列中等待执行。
add_request提交给驱动层
/*
 * add_request links a request into a device's request list.
 *
 * Per-drive statistics in kstat.dk_drive[] are bumped first. The list itself
 * is then modified with interrupts disabled so nothing else can touch it:
 *   - if the device has no current request, req becomes the current request
 *     and the driver's request_fn is called right away;
 *   - otherwise req is inserted at the position chosen by the IN_ORDER()
 *     comparisons, and request_fn is called only for SCSI majors.
 */
static void add_request(struct blk_dev_struct * dev, struct request * req)
{
	struct request *cursor;
	short drive;

	switch (MAJOR(req->dev)) {
	case SCSI_DISK_MAJOR:
		drive = (MINOR(req->dev) & 0x0070) >> 4;
		if (drive < 4)
			kstat.dk_drive[drive]++;
		break;
	case HD_MAJOR:
	case XT_DISK_MAJOR:
		drive = (MINOR(req->dev) & 0x0040) >> 6;
		kstat.dk_drive[drive]++;
		break;
	case IDE1_MAJOR:
		drive = ((MINOR(req->dev) & 0x0040) >> 6) + 2;
		kstat.dk_drive[drive]++;
		break;
	default:
		break;
	}

	req->next = NULL;
	cli();
	if (req->bh)
		mark_buffer_clean(req->bh);

	cursor = dev->current_request;
	if (!cursor) {
		/* Idle device: this request is the whole queue, start the driver. */
		dev->current_request = req;
		(dev->request_fn)();
		sti();
		return;
	}

	/* Walk the list until the slot after `cursor` is where req belongs. */
	while (cursor->next) {
		if ((IN_ORDER(cursor,req) || !IN_ORDER(cursor,cursor->next)) &&
		    IN_ORDER(req,cursor->next))
			break;
		cursor = cursor->next;
	}
	req->next = cursor->next;
	cursor->next = req;

	/* for SCSI devices, call request_fn unconditionally */
	if (scsi_major(MAJOR(req->dev)))
		(dev->request_fn)();

	sti();
}
add_request主要是把请求插入设备的请求队列,并在队列原本为空(或设备为SCSI设备)时调用设备注册的request_fn方法来执行请求。我们以hd为例来进行分析:hd设备的request_fn为do_hd_request,它位于drivers/block/hd.c中。
/*
 * Request entry point of the hd driver: run hd_request() with the hard-disk
 * interrupt line (HD_IRQ) disabled, and re-enable it afterwards.
 */
static void do_hd_request (void)
{
	disable_irq(HD_IRQ);	/* keep the HD interrupt away while the request is set up */
	hd_request();
	enable_irq(HD_IRQ);
}
先是禁止硬盘中断(HD_IRQ),然后调用hd_request发起数据请求,最后重新打开该中断。
/*
* The driver enables interrupts as much as possible. In order to do this,
* (a) the device-interrupt is disabled before entering hd_request(),
* and (b) the timeout-interrupt is disabled before the sti().
*
* Interrupts are still masked (by default) whenever we are exchanging
* data/cmds with a drive, because some drives seem to have very poor
* tolerance for latency during I/O. For devices which don't suffer from
* that problem (most don't), the unmask_intr[] flag can be set to unmask
* other interrupts during data/cmd transfers (by defining DEFAULT_UNMASK_INTR
* to 1, or by using "hdparm -u1 /dev/hd?" from the shell).
*/
/*
 * hd_request: start the request at the head of the hd queue (CURRENT).
 *
 * The request is validated against the partition table, its sector number is
 * converted to cylinder/head/sector, and the matching read or write command
 * is issued through hd_out() together with the interrupt handler that will
 * continue the transfer. Invalid requests are failed with end_request(0) and
 * the next one is tried.
 */
static void hd_request(void)
{
unsigned int dev, block, nsect, sec, track, head, cyl;
/* end_request() marks a finished request with dev = -1; nothing to start then. */
if (CURRENT && CURRENT->dev < 0) return;
/* NOTE(review): a non-zero DEVICE_INTR presumably means a command is still in
   flight and its handler will pick up the queue -- confirm. */
if (DEVICE_INTR)
return;
repeat:
/* Switch off the HD timeout timer before interrupts are enabled again. */
timer_active &= ~(1<<HD_TIMER);
sti();
/* NOTE(review): INIT_REQUEST is a macro defined elsewhere; presumably it
   returns from this function when the queue is empty -- confirm. */
INIT_REQUEST;
if (reset) {
cli();
reset_hd();
return;
}
dev = MINOR(CURRENT->dev);
block = CURRENT->sector;
nsect = CURRENT->nr_sectors;
/* Reject an unknown minor or a transfer outside the partition. */
if (dev >= (NR_HD<<6) || block >= hd[dev].nr_sects || ((block+nsect) > hd[dev].nr_sects)) {
#ifdef DEBUG
if (dev >= (NR_HD<<6))
printk("hd: bad minor number: device=0x%04x\n", CURRENT->dev);
else
printk("hd%c: bad access: block=%d, count=%d\n",
(CURRENT->dev>>6)+'a', block, nsect);
#endif
end_request(0);
goto repeat;
}
/* Make the sector number relative to the whole drive; dev becomes the drive index. */
block += hd[dev].start_sect;
dev >>= 6;
if (special_op[dev]) {
if (do_special_op(dev))
goto repeat;
return;
}
/* Convert the linear sector number to sector (1-based), head and cylinder. */
sec = block % hd_info[dev].sect + 1;
track = block / hd_info[dev].sect;
head = track % hd_info[dev].head;
cyl = track / hd_info[dev].head;
#ifdef DEBUG
printk("hd%c: %sing: CHS=%d/%d/%d, sectors=%d, buffer=0x%08lx\n",
dev+'a', (CURRENT->cmd == READ)?"read":"writ",
cyl, head, sec, nsect, (unsigned long) CURRENT->buffer);
#endif
/* Interrupts stay masked while talking to the drive unless unmask_intr[] allows otherwise. */
if (!unmask_intr[dev])
cli();
if (CURRENT->cmd == READ) {
unsigned int cmd = mult_count[dev] > 1 ? WIN_MULTREAD : WIN_READ;
hd_out(dev,nsect,sec,head,cyl,cmd,&read_intr); // issue the read; read_intr is passed as the interrupt handler
if (reset)
goto repeat;
return;
}
if (CURRENT->cmd == WRITE) {
if (mult_count[dev])
hd_out(dev,nsect,sec,head,cyl,WIN_MULTWRITE,&multwrite_intr);
else
hd_out(dev,nsect,sec,head,cyl,WIN_WRITE,&write_intr);
if (reset)
goto repeat;
/* A write has to push the first chunk of data itself once wait_DRQ() succeeds. */
if (wait_DRQ()) {
bad_rw_intr();
goto repeat;
}
if (mult_count[dev]) {
WCURRENT = *CURRENT;
multwrite(dev);
} else
outsw(HD_DATA,CURRENT->buffer,256);
return;
}
panic("unknown hd-command");
}
由于本文分析的是读场景,hd_request执行到hd_out时,向硬盘发出读命令并注册read_intr作为中断回调函数,然后便返回。
此后进程会继续执行到ext2_file_read中的wait_on_buffer函数处。
/*
* Rewrote the wait-routines to use the "new" wait-queue functionality,
* and getting rid of the cli-sti pairs. The wait-queue routines still
* need cli-sti, but now it's just a couple of 386 instructions or so.
*
* Note that the real wait_on_buffer() is an inline function that checks
* if 'b_wait' is set before calling this, so that the queues aren't set
* up unnecessarily.
*/
void __wait_on_buffer(struct buffer_head * bh)
{
struct wait_queue wait = { current, NULL };
bh->b_count++;
add_wait_queue(&bh->b_wait, &wait);
repeat:
current->state = TASK_UNINTERRUPTIBLE;
if (bh->b_lock) {
schedule();
goto repeat;
}
remove_wait_queue(&bh->b_wait, &wait);
bh->b_count--;
current->state = TASK_RUNNING;
}
此时,读取文件的进程就会进入不可中断的阻塞状态,等待底层硬件将数据读取完成之后通过回调来唤醒。
硬件调用read_intr读取文件
/*
 * read_intr: handler passed to hd_out() for read commands.
 *
 * Polls the status port until the drive offers data, pulls one sector (or up
 * to mult_count sectors in multiple mode) into the current request's buffer,
 * updates the request's bookkeeping and calls end_request(1) each time the
 * current buffer has been filled. On a status error or poll timeout the
 * request goes through bad_rw_intr() and the queue is restarted.
 */
static void read_intr(void)
{
unsigned int dev = DEVICE_NR(CURRENT->dev);
int i, retries = 100000, msect = mult_count[dev], nsect;
if (unmask_intr[dev])
sti(); /* permit other IRQs during xfer */
/* Poll the status register: keep trying while busy, bail out on a bad status,
   proceed as soon as DRQ is set. */
do {
i = (unsigned) inb_p(HD_STATUS);
if (i & BUSY_STAT)
continue;
if (!OK_STATUS(i))
break;
if (i & DRQ_STAT)
goto ok_to_read;
} while (--retries > 0);
/* Error status or retries exhausted: report it and restart request processing. */
dump_status("read_intr", i);
bad_rw_intr();
hd_request();
return;
ok_to_read:
/* In multiple mode read as many sectors as the current buffer still needs,
   capped by the multiple-mode budget in msect; otherwise one sector at a time. */
if (msect) {
if ((nsect = CURRENT->current_nr_sectors) > msect)
nsect = msect;
msect -= nsect;
} else
nsect = 1;
/* nsect<<8 transfer units per call; the buffer pointer then advances nsect<<9 bytes. */
insw(HD_DATA,CURRENT->buffer,nsect<<8);
CURRENT->sector += nsect;
CURRENT->buffer += nsect<<9;
CURRENT->errors = 0;
/* i now holds the number of sectors still outstanding for the whole request. */
i = (CURRENT->nr_sectors -= nsect);
#ifdef DEBUG
printk("hd%c: read: sectors(%ld-%ld), remaining=%ld, buffer=0x%08lx\n",
dev+'a', CURRENT->sector, CURRENT->sector+nsect,
CURRENT->nr_sectors, (unsigned long) CURRENT->buffer+(nsect<<9));
#endif
if ((CURRENT->current_nr_sectors -= nsect) <= 0) // the current buffer is full: report it complete via end_request
end_request(1);
if (i > 0) {
/* More sectors to come: keep reading now if the multiple-mode budget allows,
   otherwise re-arm this handler for the next interrupt. */
if (msect)
goto ok_to_read;
SET_INTR(&read_intr);
return;
}
(void) inb_p(HD_STATUS);
#if (HD_DELAY > 0)
last_req = read_timer();
#endif
/* This request is finished; start the next one if the queue is not empty. */
if (CURRENT)
hd_request();
return;
}
就是操作硬件将对应的数据读取出来,如果处理完成,则调用end_request来通知休眠的进程数据读取成功。
end_request完成请求唤醒进程
/*
 * end_request: finish the buffer at the head of the current request.
 *
 * `uptodate` is non-zero on success and zero on an I/O error. The head buffer
 * is detached from the request, its b_uptodate flag is set accordingly and it
 * is unlocked. If the request still has buffers chained to it, the request is
 * re-pointed at the next buffer and the function returns; otherwise the
 * request is removed from the driver, marked free (dev = -1) and waiters on
 * wait_for_request are woken.
 */
#ifdef IDE_DRIVER
static void end_request(byte uptodate, byte hwif) {
struct request *req = ide_cur_rq[HWIF];
#else
static void end_request(int uptodate) {
struct request *req = CURRENT;
#endif /* IDE_DRIVER */
struct buffer_head * bh;
req->errors = 0;
if (!uptodate) {
printk("end_request: I/O error, dev %04lX, sector %lu\n",
(unsigned long)req->dev, req->sector);
/* Skip past the failing block: advance to the next BLOCK_SIZE boundary. */
req->nr_sectors--;
req->nr_sectors &= ~SECTOR_MASK;
req->sector += (BLOCK_SIZE / 512);
req->sector &= ~SECTOR_MASK;
}
if ((bh = req->bh) != NULL) {
/* Detach the finished buffer from the request's chain. */
req->bh = bh->b_reqnext;
bh->b_reqnext = NULL;
bh->b_uptodate = uptodate;
if (!uptodate) bh->b_req = 0; /* So no "Weird" errors */
/* Release the buffer lock. NOTE(review): unlock_buffer() is not shown here;
   presumably this is what wakes a reader sleeping on bh->b_wait -- confirm. */
unlock_buffer(bh);
if ((bh = req->bh) != NULL) {
/* More buffers are chained to this request: continue with the next one. */
req->current_nr_sectors = bh->b_size >> 9;
if (req->nr_sectors < req->current_nr_sectors) {
req->nr_sectors = req->current_nr_sectors;
printk("end_request: buffer-list destroyed\n");
}
req->buffer = bh->b_data;
return;
}
}
/* The whole request is done: take it off the driver and release it. */
#ifdef IDE_DRIVER
ide_cur_rq[HWIF] = NULL;
#else
DEVICE_OFF(req->dev);
CURRENT = req->next;
#endif /* IDE_DRIVER */
if (req->sem != NULL)
up(req->sem);
req->dev = -1;
/* Wake tasks sleeping on wait_for_request (presumably those waiting for a
   free request in __get_request_wait() -- confirm). */
wake_up(&wait_for_request);
}
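end_request把当前buffer标记为已更新并通过unlock_buffer解锁;如果该请求上还挂着其他buffer,则直接返回、继续传输,否则把请求从设备队列中摘下,并调用wake_up唤醒等待在wait_for_request上的进程。而睡眠在bh->b_wait上、等待buffer解锁的读进程,则应当是在unlock_buffer中被唤醒的(本文未列出其实现),其内部同样是调用下面的wake_up。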
/*
 * Make every sleeping task on the wait queue *q runnable.
 *
 * wake_up doesn't wake up stopped processes - they have to be awakened
 * with signals or similar. Only tasks in TASK_UNINTERRUPTIBLE or
 * TASK_INTERRUPTIBLE state are touched.
 *
 * Note that this doesn't need cli-sti pairs: interrupts may not change
 * the wait-queue structures directly, but only call wake_up() to wake
 * a process. The process itself must remove the queue once it has woken.
 *
 * The queue is walked as a circular list starting at *q; a NULL next
 * pointer is reported as a corrupted queue and ends the walk.
 */
void wake_up(struct wait_queue **q)
{
	struct wait_queue *tmp;
	struct task_struct *task;

	if (!q)
		return;
	tmp = *q;
	if (!tmp)
		return;

	do {
		task = tmp->task;
		if (task != NULL &&
		    ((task->state == TASK_UNINTERRUPTIBLE) ||
		     (task->state == TASK_INTERRUPTIBLE))) {
			/* Mark the sleeper runnable again. */
			task->state = TASK_RUNNING;
			/* Ask for a reschedule when the woken task's counter
			   exceeds the current task's by more than 3. */
			if (task->counter > current->counter + 3)
				need_resched = 1;
		}

		if (!tmp->next) {
			printk("wait_queue is bad (eip = %p)\n",
				__builtin_return_address(0));
			printk(" q = %p\n",q);
			printk(" *q = %p\n",*q);
			printk(" tmp = %p\n",tmp);
			break;
		}
		tmp = tmp->next;
	} while (tmp != *q);
}
此时进程会从__wait_on_buffer中的schedule()返回,经goto repeat重新检查bh->b_lock是否已被释放;释放之后就会把自己从等待队列中移除,再将进程状态设置为运行状态。
至此,一个读请求的流程就完成。
总结
本文只是基于Linux1.2的ext2,简单地学习了文件读取的流程;有关ext2文件系统的整体描述,网上有更为详尽的资料可查阅。
https://blog.csdn.net/ac_dao_di/article/details/54606790
https://blog.csdn.net/jmh1996/article/details/90139485
https://blog.csdn.net/XD_hebuters/article/details/79574902
https://blog.csdn.net/XD_hebuters/article/details/79574902
https://blog.csdn.net/Q_AN1314/article/details/79210810
https://yuhao0102.github.io/2020/11/08/%E6%B7%B1%E5%85%A5%E7%90%86%E8%A7%A3Linux%E5%86%85%E6%A0%B8%20%E7%AC%94%E8%AE%B07/
https://zhuanlan.zhihu.com/p/64536225