Next Previous Contents

2. open

The open system call is found in fs/open.c:

int sys_open(const char *filename, int flags, int mode) {
        char *tmp = getname(filename);
        int fd = get_unused_fd();
        struct file *f = filp_open(tmp, flags, mode);
        fd_install(fd, f);
        putname(tmp);
        return fd;
}

The routine getname() is found in fs/namei.c. It copies the file name from user space to kernel space:

#define __getname()     kmem_cache_alloc(names_cachep, SLAB_KERNEL)
#define putname(name)   kmem_cache_free(names_cachep, (void *)(name))

char *getname(const char *filename) {
        char *tmp = __getname();        /* allocate some memory */
        strncpy_from_user(tmp, filename, PATH_MAX + 1);
        return tmp;
}

The routine get_unused_fd() is found in fs/open.c again. It returns the first unused filedescriptor:

int get_unused_fd(void) {
        struct files_struct *files = current->files;
        int fd = find_next_zero_bit(files->open_fds,
                                    files->max_fdset, files->next_fd);
        FD_SET(fd, files->open_fds);    /* in use now */
        files->next_fd = fd + 1;
        return fd;
}

Here current is the pointer to the user task struct for the currently executing task.

The routine fd_install() is found in include/linux/file.h. It just stores the information returned by filp_open()

void fd_install(unsigned int fd, struct file *file) {
        struct files_struct *files = current->files;
        files->fd[fd] = file;
}

So all the interesting work of sys_open() is done in filp_open(). This routine is found in fs/open.c:

struct file *filp_open(const char *filename, int flags, int mode) {
        struct nameidata nd;
        open_namei(filename, flags, mode, &nd);
        return dentry_open(nd.dentry, nd.mnt, flags);
}

The struct nameidata is defined in include/linux/fs.h. It is used during lookups.

struct nameidata {
        struct dentry *dentry;
        struct vfsmount *mnt;
        struct qstr last;
};

The routine open_namei() is found in fs/namei.c:

open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) {
        if (!(flag & O_CREAT)) {
                /* The simplest case - just a plain lookup. */
                if (*pathname == '/') {
                        nd->mnt = mntget(current->fs->rootmnt);
                        nd->dentry = dget(current->fs->root);
                } else {
                        nd->mnt = mntget(current->fs->pwdmnt);
                        nd->dentry = dget(current->fs->pwd);
                }
                path_walk(pathname, nd);
                /* Check permissions etc. */
                ...
                return 0;
        }
        ...
}

An inode (index node) describes a file. A file can have several names (or no name at all), but it has a unique inode. A dentry (directory entry) describes a name of a file: the inode plus the pathname used to find it. A vfsmount describes the filesystem we are in.

So, essentially, the lookup part of open_namei() is found in path_walk():

path_walk(const char *name, struct nameidata *nd) {
        struct dentry *dentry;
        for(;;) {
                struct qstr this;
                this.name = next_part_of(name);
                this.len = length_of(this.name);
                this.hash = hash_fn(this.name);
                /* if . or .. then special, otherwise: */
                dentry = cached_lookup(nd->dentry, &this);
                if (!dentry)
                        dentry = real_lookup(nd->dentry, &this);
                nd->dentry = dentry;
                if (this_was_the_final_part)
                        return;
        }
}

Here the cached_lookup() tries to find the given dentry in a cache of recently used dentries. If it is not found, the real_lookup() goes to the filesystem, which probably goes to disk, and actually finds the thing. After path_walk() is done, the nd argument contains the required dentry, which in turn has the inode information on the file. Finally we do dentry_open() that initializes a file struct:

struct file *
dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) {
        struct file *f = get_empty_filp();
        f->f_dentry = dentry;
        f->f_vfsmnt = mnt;
        f->f_pos = 0;
        f->f_op = dentry->d_inode->i_fop;
        ...
        return f;
}

So far the open. In short: walk the tree, for each component hope the information is in cache, and if not ask the file system. How does this work? Each file system type provides structs super_operations, file_operations, inode_operations, address_space_operations that contain the addresses of the routines that can do stuff. And thus

struct dentry *
real_lookup(struct dentry *parent, struct qstr *name, int flags) {
        struct dentry *dentry = d_alloc(parent, name);
        parent->d_inode->i_op->lookup(dir, dentry);
        return dentry;
}

calls on the lookup routine for the specific fiilesystem, as found in the struct inode_operations in the inode of the dentry for the directory in which we do the lookup.

And this file system specific routine must read the disk data and search the directory for the file we are looking for. Good examples of file systems are minix and romfs because they are simple and small. For example, in fs/romfs/inode.c:

romfs_lookup(struct inode *dir, struct dentry *dentry) {
        const char *name = dentry->d_name.name;
        int len = dentry->d_name.len;
        char fsname[ROMFS_MAXFN];
        struct romfs_inode ri;
        unsigned long offset = dir->i_ino & ROMFH_MASK;
        for (;;) {
                romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE);
                romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
                if (strncmp (name, fsname, len) == 0)
                        break;
                /* next entry */
                offset = ntohl(ri.next) & ROMFH_MASK;
        }
        inode = iget(dir->i_sb, offset);
        d_add (dentry, inode);
        return 0;
}

romfs_copyfrom(struct inode *i, void *dest,
               unsigned long offset, unsigned long count) {
        struct buffer_head *bh;

        bh = bread(i->i_dev, offset>>ROMBSBITS, ROMBSIZE);
        memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), count);
        brelse(bh);
}

(All complications, all locking, and all error handling deleted.)


Next Previous Contents