Suppose we are given a file descriptor (which keeps track of the inode and of the file position) from which we want to read. In fs/read_write.c we find:
ssize_t sys_read(unsigned int fd, char *buf, size_t count) {
        struct file *file = fget(fd);
        return file->f_op->read(file, buf, count, &file->f_pos);
}
That is, the read system call asks the file system to do the reading, starting at the current file position. The f_op field was filled in by the dentry_open() routine above, from the i_fop field of the inode.
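
For orientation, here is the user-space side that sets all of this in motion. This is an ordinary POSIX program, not kernel code (reading /dev/hda directly requires root):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
        char buf[512];
        ssize_t n;
        int fd = open("/dev/hda", O_RDONLY);  /* needs root */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof buf);        /* enters sys_read() */
        printf("read %zd bytes\n", n);
        close(fd);
        return 0;
}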
For romfs, the struct file_operations is assigned in romfs_read_inode(). For a regular file (case 2) it assigns generic_ro_fops. For a block special file (case 4) it calls init_special_inode() (see devices.c), which assigns def_blk_fops.
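
So the selection of the right read routine is nothing more than filling a function-pointer table per inode type. Here is a toy user-space imitation of that dispatch; the struct layouts and function names are invented for illustration, not taken from the kernel:

#include <stdio.h>

struct file;
struct file_ops { long (*read)(struct file *, char *, long); };
struct file { struct file_ops *f_op; };

static long regular_read(struct file *f, char *buf, long n) {
        printf("generic file read, %ld bytes\n", n);
        return n;
}
static long blockdev_read(struct file *f, char *buf, long n) {
        printf("block device read, %ld bytes\n", n);
        return n;
}

/* cf. generic_ro_fops and def_blk_fops */
static struct file_ops generic_fops = { regular_read };
static struct file_ops blkdev_fops  = { blockdev_read };

int main(void) {
        char buf[64];
        struct file f1 = { &generic_fops };   /* "regular file" */
        struct file f2 = { &blkdev_fops };    /* "block special file" */
        f1.f_op->read(&f1, buf, sizeof buf);  /* dispatch, as in sys_read */
        f2.f_op->read(&f2, buf, sizeof buf);
        return 0;
}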
How come romfs_read_inode() was ever called? When the filesystem was mounted, the routine romfs_read_super() was called, and it assigned romfs_ops to the s_op field of the superblock struct.
struct super_operations romfs_ops = {
        read_inode:     romfs_read_inode,
        statfs:         romfs_statfs,
};
And the iget() that was skipped over in the discussion above (in romfs_lookup()) finds the inode with given number ino in a cache; if it cannot be found there, a new inode struct is created by calling get_new_inode() (see fs/inode.c):
struct inode *
iget(struct super_block *sb, unsigned long ino) {
        struct list_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode = find_inode(sb, ino, head);

        if (inode) {
                wait_on_inode(inode);
                return inode;
        }
        return get_new_inode(sb, ino, head);
}

struct inode *
get_new_inode(struct super_block *sb, unsigned long ino,
              struct list_head *head) {
        struct inode *inode = alloc_inode();

        inode->i_sb = sb;
        inode->i_dev = sb->s_dev;
        inode->i_ino = ino;
        ...
        sb->s_op->read_inode(inode);
        return inode;
}
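
The lookup-or-create idiom of iget() is worth seeing in isolation. Below is a self-contained miniature (fixed-size table, no locking, invented names, error handling omitted); the kernel version additionally has to wait on an inode that is still being read in:

#include <stdlib.h>

#define NBUCKETS 64

struct node { unsigned long key; struct node *next; };
static struct node *table[NBUCKETS];

/* look the key up in its hash chain; if absent, create and insert */
static struct node *lookup_or_create(unsigned long key) {
        struct node **head = &table[key % NBUCKETS];
        struct node *n;

        for (n = *head; n; n = n->next)
                if (n->key == key)
                        return n;        /* hit: the find_inode() case */
        n = malloc(sizeof *n);           /* miss: the get_new_inode() case */
        n->key = key;
        n->next = *head;
        *head = n;
        return n;
}

int main(void) {
        return lookup_or_create(42) == lookup_or_create(42) ? 0 : 1;
}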
So that is how the inode was filled. In our case, /dev/hda is a block special file, so the routine called by sys_read is def_blk_fops.read, and inspection of block_dev.c shows that this is the routine block_read():
ssize_t
block_read(struct file *filp, char *buf, size_t count, loff_t *ppos) {
        struct inode *inode = filp->f_dentry->d_inode;
        kdev_t dev = inode->i_rdev;
        ssize_t blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
        loff_t offset = *ppos;
        ssize_t read = 0;
        size_t left, block, blocks;
        int bhrequest;
        struct buffer_head *bhreq[NBUF];
        struct buffer_head *buflist[NBUF];
        struct buffer_head **bh;

        left = count;                   /* bytes to read */
        block = offset / blocksize;     /* first block */
        offset &= (blocksize-1);        /* starting offset in block */
        blocks = (left + offset + blocksize - 1) / blocksize;
        bh = buflist;
        do {
                bhrequest = 0;          /* buffers needing actual I/O */
                while (blocks) {        /* in reality, at most NBUF at a time */
                        --blocks;
                        *bh = getblk(dev, block++, blocksize);
                        if (*bh && !buffer_uptodate(*bh))
                                bhreq[bhrequest++] = *bh;
                }
                if (bhrequest)
                        ll_rw_block(READ, bhrequest, bhreq);
                /* wait for I/O to complete,
                   copy result to user space,
                   increment read and *ppos, decrement left */
        } while (left > 0);
        return read;
}
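
The arithmetic at the top of block_read() can be checked by hand. For example, reading count = 1000 bytes at offset *ppos = 1500 with blocksize 1024 starts in block 1 at in-block offset 476 and touches 2 blocks (note that the & trick assumes the blocksize is a power of two):

#include <stdio.h>

int main(void) {
        long blocksize = 1024, count = 1000, offset = 1500;
        long left   = count;                      /* bytes to read */
        long block  = offset / blocksize;         /* first block: 1 */
        long inblk  = offset & (blocksize - 1);   /* offset in block: 476 */
        long blocks = (left + inblk + blocksize - 1) / blocksize;  /* 2 */

        printf("first block %ld, in-block offset %ld, %ld blocks\n",
               block, inblk, blocks);
        return 0;
}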
So the building blocks here are getblk(), ll_rw_block(), and wait_on_buffer(). The first of these lives in fs/buffer.c. It finds the buffer that already contains the required data, if we are lucky, and otherwise a buffer that is going to be used.
struct buffer_head *
getblk(kdev_t dev, int block, int size) {
        struct buffer_head *bh;
        int isize;

try_again:
        bh = __get_hash_table(dev, block, size);
        if (bh)
                return bh;

        isize = BUFSIZE_INDEX(size);
        bh = free_list[isize].list;
        if (bh) {
                __remove_from_free_list(bh);
                init_buffer(bh);
                bh->b_dev = dev;
                bh->b_blocknr = block;
                ...
                return bh;
        }
        refill_freelist(size);
        goto try_again;
}
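
The shape of getblk() is a classic: a fast path, then a slow path that replenishes a resource and retries. A miniature of that retry loop (user-space sketch with invented names; the hash-table fast path is elided):

#include <stdlib.h>

struct buf { struct buf *next; };
static struct buf *free_list;

static void refill_freelist(void) {
        /* the kernel may sleep here, flushing dirty buffers to make room */
        struct buf *b = malloc(sizeof *b);
        b->next = free_list;
        free_list = b;
}

/* getblk()'s slow path: take a buffer from the free list,
   or refill the list and try again */
static struct buf *get_buffer(void) {
try_again:
        if (free_list) {
                struct buf *b = free_list;
                free_list = b->next;
                return b;
        }
        refill_freelist();
        goto try_again;
}

int main(void) {
        return get_buffer() ? 0 : 1;
}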
The real I/O is started by ll_rw_block(). It lives in drivers/block/ll_rw_blk.c:
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) {
        int i;

        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i];
                bh->b_end_io = end_buffer_io_sync;
                submit_bh(rw, bh);
        }
}
Here bh->b_end_io specifies what to do when the I/O is finished. In this case:
void end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
        mark_buffer_uptodate(bh, uptodate);
        unlock_buffer(bh);
}
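
So b_end_io is a completion callback: the submitter chooses the policy, and the driver merely invokes the hook when the I/O finishes. In miniature (invented names):

#include <stdio.h>

struct io { int uptodate; void (*end_io)(struct io *, int); };

/* the synchronous policy: record the result; the kernel version also
   unlocks the buffer so wait_on_buffer() sleepers wake up */
static void end_io_sync(struct io *io, int uptodate) {
        io->uptodate = uptodate;
        printf("I/O finished, uptodate = %d\n", uptodate);
}

/* the "driver": it neither knows nor cares what completion means */
static void driver_complete(struct io *io, int ok) {
        io->end_io(io, ok);
}

int main(void) {
        struct io io = { 0, end_io_sync };
        driver_complete(&io, 1);
        return 0;
}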
So, ll_rw_block() just feeds the requests it gets, one by one, to submit_bh():
void submit_bh(int rw, struct buffer_head *bh) {
        bh->b_rdev = bh->b_dev;
        bh->b_rsector = bh->b_blocknr * (bh->b_size >> 9);
        generic_make_request(rw, bh);
}
So, submit_bh() just converts the block number into a sector number (a sector is 512 bytes, hence the shift by 9) and passes things along to generic_make_request(), the routine that sends I/O requests to block devices:
void generic_make_request(int rw, struct buffer_head *bh) {
        request_queue_t *q;

        q = blk_get_queue(bh->b_rdev);
        q->make_request_fn(q, rw, bh);
}
Thus, it finds the right queue and calls the make_request function for that queue.

struct blk_dev_struct {
        request_queue_t request_queue;
        queue_proc      *queue;
        void            *data;
} blk_dev[MAX_BLKDEV];

request_queue_t *blk_get_queue(kdev_t dev)
{
        return blk_dev[MAJOR(dev)].queue(dev);
}
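
In other words, queue lookup is dispatch on the major device number through a table of function pointers. A toy version (invented names; major 3 and the minor layout mimic the first IDE interface):

#include <stdio.h>

#define MAX_MAJOR 8

typedef int queue_t;                     /* stand-in for request_queue_t */
typedef queue_t *queue_fn(int minor);

static queue_t ide0_queues[2];           /* one queue per drive */
static queue_t *ide_queue(int minor) { return &ide0_queues[minor >> 6]; }

static struct { queue_fn *queue; } blk_tab[MAX_MAJOR];

/* cf. blk_get_queue(): index by major, then let the driver pick by minor */
static queue_t *get_queue(int major, int minor) {
        return blk_tab[major].queue(minor);
}

int main(void) {
        blk_tab[3].queue = ide_queue;    /* major 3: first IDE interface */
        printf("hda queue %p, hdb queue %p\n",
               (void *) get_queue(3, 0), (void *) get_queue(3, 64));
        return 0;
}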
In our case (/dev/hda), the blk_dev struct was filled in by hwif_init() (from drivers/ide/ide-probe.c):

        blk_dev[hwif->major].data = hwif;
        blk_dev[hwif->major].queue = ide_get_queue;

and this ide_get_queue() is found in drivers/ide/ide.c:
#define DEVICE_NR(dev) (MINOR(dev) >> PARTN_BITS)

request_queue_t *ide_get_queue(kdev_t dev) {
        ide_hwif_t *hwif = (ide_hwif_t *) blk_dev[MAJOR(dev)].data;

        return &hwif->drives[DEVICE_NR(dev) & 1].queue;
}
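
So the IDE minor number encodes both the unit and the partition: with PARTN_BITS = 6, /dev/hda is minor 0, /dev/hda1 minor 1, /dev/hdb minor 64. A quick user-space check of that decoding (this variant takes the minor directly instead of a kdev_t):

#include <stdio.h>

#define PARTN_BITS 6
#define DEVICE_NR(minor) ((minor) >> PARTN_BITS)

int main(void) {
        int minors[] = { 0, 1, 64, 66 };   /* hda, hda1, hdb, hdb2 */
        for (int i = 0; i < 4; i++)
                printf("minor %2d -> unit %d, partition %d\n",
                       minors[i],
                       DEVICE_NR(minors[i]) & 1,
                       minors[i] & ((1 << PARTN_BITS) - 1));
        return 0;
}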
This .queue field was filled in by ide_init_queue():

void ide_init_queue(ide_drive_t *drive) {
        request_queue_t *q = &drive->queue;

        q->queuedata = HWGROUP(drive);
        blk_init_queue(q, do_ide_request);
}
And blk_init_queue() (from ll_rw_blk.c again):

void blk_init_queue(request_queue_t *q, request_fn_proc *rfn) {
        ...
        q->request_fn = rfn;
        q->make_request_fn = __make_request;
        q->merge_requests_fn = ll_merge_requests_fn;
        ...
}
Aha, so we have found the q->make_request_fn. Here it is:

__make_request(request_queue_t *q, int rw, struct buffer_head *bh) {
        /* try to merge the request with adjacent ones */
        ...
        /* get a struct request and fill it with device, start, length, ... */
        ...
        add_request(q, req, insert_here);
        if (!q->plugged)
                q->request_fn(q);
}

void add_request(request_queue_t *q, struct request *req,
                 struct list_head *insert_here) {
        list_add(&req->queue, insert_here);
}
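
list_add() here is the kernel's intrusive doubly-linked list: the link field lives inside struct request itself. A freestanding miniature of the same splice (simplified struct request; where the kernel uses list_entry(), this does the offsetof by hand):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/* insert 'item' right after 'head': the same four-pointer splice
   as the kernel's list_add() */
static void list_add(struct list_head *item, struct list_head *head) {
        item->next = head->next;
        item->prev = head;
        head->next->prev = item;
        head->next = item;
}

struct request { int sector; struct list_head queue; };

int main(void) {
        struct list_head q = { &q, &q };      /* empty circular list */
        struct request r1 = { 100, { 0, 0 } };
        struct request r2 = { 200, { 0, 0 } };
        struct request *first;

        list_add(&r1.queue, &q);              /* queue: r1 */
        list_add(&r2.queue, &q);              /* queue: r2, r1 */
        first = (struct request *)
                ((char *) q.next - offsetof(struct request, queue));
        printf("first queued sector: %d\n", first->sector);  /* 200 */
        return 0;
}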
When the request has been queued, q->request_fn is called. What is that? We saw it above: it is do_ide_request(), which lives in ide.c.

void do_ide_request(request_queue_t *q) {
        ide_do_request(q->queuedata, 0);
}
void ide_do_request(ide_hwgroup_t *hwgroup, int masked_irq) {
        ide_drive_t *drive;
        ide_startstop_t startstop;

        while (!hwgroup->busy) {
                hwgroup->busy = 1;
                drive = choose_drive(hwgroup);
                startstop = start_request(drive);
                if (startstop == ide_stopped)
                        hwgroup->busy = 0;
        }
}
ide_startstop_t
start_request(ide_drive_t *drive) {
        ide_hwif_t *hwif = HWIF(drive);
        unsigned long block;
        struct request *rq;
        unsigned int minor;

        rq = blkdev_entry_next_request(&drive->queue.queue_head);
        minor = MINOR(rq->rq_dev);
        block = rq->sector;
        block += drive->part[minor & PARTN_MASK].start_sect;
        SELECT_DRIVE(hwif, drive);
        return (DRIVER(drive)->do_request(drive, rq, block));
}
So, in the case of a partitioned disk it is only at this very low level that we add in the starting sector of the partition in order to get an absolute sector.
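
Concretely, with invented numbers: if the partition starts at sector 1028160 and the request asks for sector 5000 within the partition, the drive is addressed at absolute sector 1033160:

#include <stdio.h>

int main(void) {
        unsigned long start_sect = 1028160;   /* partition start (invented) */
        unsigned long rq_sector  = 5000;      /* sector within the partition */
        unsigned long block = rq_sector + start_sect;  /* as in start_request() */
        printf("absolute sector: %lu\n", block);       /* 1033160 */
        return 0;
}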
The first actual port access happened already:

#define SELECT_DRIVE(hwif,drive) \
        OUT_BYTE((drive)->select.all, hwif->io_ports[IDE_SELECT_OFFSET]);

but this do_request function must do the rest.
For a disk it is defined in ide-disk.c, in the ide_driver_t idedisk_driver, and the function turns out to be do_rw_disk():
ide_startstop_t
do_rw_disk(ide_drive_t *drive, struct request *rq, unsigned long block) {
        if (IDE_CONTROL_REG)
                OUT_BYTE(drive->ctl, IDE_CONTROL_REG);
        OUT_BYTE(rq->nr_sectors, IDE_NSECTOR_REG);
        if (drive->select.b.lba) {
                OUT_BYTE(block, IDE_SECTOR_REG);
                OUT_BYTE(block >>= 8, IDE_LCYL_REG);
                OUT_BYTE(block >>= 8, IDE_HCYL_REG);
                OUT_BYTE(((block >> 8) & 0x0f) | drive->select.all, IDE_SELECT_REG);
        } else {
                unsigned int sect, head, cyl, track;

                track = block / drive->sect;
                sect  = block % drive->sect + 1;
                OUT_BYTE(sect, IDE_SECTOR_REG);
                head = track % drive->head;
                cyl  = track / drive->head;
                OUT_BYTE(cyl, IDE_LCYL_REG);
                OUT_BYTE(cyl >> 8, IDE_HCYL_REG);
                OUT_BYTE(head | drive->select.all, IDE_SELECT_REG);
        }
        if (rq->cmd == READ) {
                ide_set_handler(drive, &read_intr, WAIT_CMD, NULL);
                OUT_BYTE(WIN_READ, IDE_COMMAND_REG);
                return ide_started;
        }
        ...
}
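
Both addressing branches are plain arithmetic: LBA cuts the 28-bit sector number into register-sized pieces, and CHS divides it out by the drive geometry. A user-space check (the geometry values are invented):

#include <stdio.h>

int main(void) {
        unsigned long block = 1033160;      /* absolute sector */

        /* LBA: low, mid, high byte plus 4 bits in the select register */
        printf("LBA regs: %02lx %02lx %02lx %01lx\n",
               block & 0xff, (block >> 8) & 0xff,
               (block >> 16) & 0xff, (block >> 24) & 0x0f);

        /* CHS, assuming (invented) 63 sectors/track and 16 heads */
        unsigned int sects = 63, heads = 16;
        unsigned long track = block / sects;
        unsigned int  sect  = block % sects + 1;  /* sectors count from 1 */
        unsigned int  head  = track % heads;
        unsigned long cyl   = track / heads;
        printf("CHS: cyl %lu head %u sect %u\n", cyl, head, sect);
        return 0;
}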
This fills the remaining control registers of the interface and starts the actual I/O. Note that ide_set_handler() sets up read_intr() to be called when we get an interrupt. read_intr() calls ide_end_request() when a request is done; that in turn calls end_that_request_first() (which calls bh->b_end_io(), as promised earlier) and end_that_request_last() (which calls blkdev_release_request(), which wakes up whoever was waiting for the block).
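
The whole completion side, condensed into a user-space sketch (single-threaded simulation, invented names): each buffer's callback runs first, then the request is released and the waiter is woken.

#include <stdio.h>

struct bh { int done; void (*b_end_io)(struct bh *, int); };

static void end_io(struct bh *bh, int uptodate) {
        bh->done = uptodate;          /* kernel: mark uptodate, unlock */
}

/* the shape of ide_end_request(): finish each buffer of the request
   (end_that_request_first), then release the request and wake the
   waiter (end_that_request_last / blkdev_release_request) */
static void request_done(struct bh *bhs[], int n, int *waiter_awake) {
        for (int i = 0; i < n; i++)
                bhs[i]->b_end_io(bhs[i], 1);
        *waiter_awake = 1;
}

int main(void) {
        struct bh b0 = { 0, end_io }, b1 = { 0, end_io };
        struct bh *req[] = { &b0, &b1 };
        int awake = 0;

        request_done(req, 2, &awake);
        printf("buffers uptodate: %d %d; waiter awake: %d\n",
               b0.done, b1.done, awake);
        return 0;
}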