You are on page 1 of 32

Linux: Files and File Systems

Gregory Kesden 15-412 Fall ‘00


Linux Virtual File System Overview

Rusling, David A, The Linux Kernel, V0.8-3, LDP, 1999, S.9.2.


Major Data Structures (1/2)
• struct files_struct – per process table
• struct file_system_type – represents an entire
file system
• struct super_block – represents super block
(metadata) of file system
• struct super_operations – operations to
manipulate the super-block (metadata)
• struct inode – represents a file
Major Data Structures (2/2)
• struct inode_operations – operations to
manipulate contents of an inode
• struct dentry – represents a name to inode
mapping
• struct file – entry in open file table; represents
state of an open file
• struct file_operations – collection of operations
that can be performed on an open file (remember
this from device drivers?)
Per Process File Information
include/linux/sched.h:
/*
 * Per-process table of open files. One of these is kept within each
 * task_struct (PCB).
 */
struct files_struct { /* kept within task_struct (PCB) */
atomic_t count;
rwlock_t file_lock;
int max_fds;
int max_fdset;
int next_fd;
struct file ** fd; /* current fd array */
fd_set *close_on_exec;
fd_set *open_fds;
/* Embedded fd_sets; presumably the initial storage behind the two
 * fd_set pointers above -- TODO confirm. */
fd_set close_on_exec_init;
fd_set open_fds_init;
/* Embedded array of NR_OPEN_DEFAULT file pointers; presumably what fd
 * refers to initially -- TODO confirm. */
struct file * fd_array[NR_OPEN_DEFAULT];
};
System-wide File Information

List of files in use: (struct list_head) sb->s_files


• One such list exists per file system. It holds the file
structures for open files.

List of free files: struct list_head free_list


• This is a system-wide list. It holds file structures that are
no longer used. Think of it as a big recycle bin.

List of newly created files: struct list_head anon_list


• This list holds newly created file structs. They are added to
this list in response to an open that couldn’t be satisfied
with the free_list.
struct file (1/2)
/* Entry in the open file table; represents the state of an open file. */
struct file {
struct list_head f_list; /* list linkage; presumably for sb->s_files / free_list / anon_list -- TODO confirm */
struct dentry *f_dentry; /* dentry (name to inode mapping) for this file */
struct file_operations *f_op; /* operations that can be performed on this open file */
atomic_t f_count;
unsigned int f_flags;
mode_t f_mode;
loff_t f_pos;
unsigned long f_reada, f_ramax, f_raend, f_ralen,
f_rawin;
struct file (2/2)
struct fown_struct f_owner;
unsigned int f_uid, f_gid;
int f_error;

unsigned long f_version;

/* needed for tty driver, and maybe others */


void *private_data;
};
struct file_operations (1/2)
/*
 * Collection of operations that can be performed on an open file.
 * Every member is a function pointer; ioctl, open and release also
 * receive the struct inode, the others that take a file receive only
 * the struct file.
 */
struct file_operations {
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int,
unsigned long );
struct file_operations (2/2)
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, struct dentry *);
int (*fasync) (int, struct file *, int);
/* The next two take a device number (kdev_t) rather than a file. */
int (*check_media_change) (kdev_t dev);
int (*revalidate) (kdev_t dev);
int (*lock) (struct file *, int, struct file_lock *);
};
struct inode (1/2)
/* Represents a file. */
struct inode {
struct list_head i_hash;
struct list_head i_list;
struct list_head i_dentry;

unsigned long i_ino;

kdev_t i_dev;

/* Usual metadata, such as might be seen with "ls -l" */

struct inode (2/2)
struct inode_operations *i_op; /* operations to manipulate the contents of this inode */
struct super_block *i_sb;
wait_queue_head_t i_wait;
struct vm_area_struct *i_mmap; /* list of VM areas for this inode */
struct pipe_inode_info *i_pipe;
/* File-system-specific inode information; one member per file system. */
union {
struct minix_inode_info minix_i;
struct ext2_inode_info ext2_i;

} u;
};
Memory Mapping (1/2)
/* One VM area (memory mapping) of a task. */
struct vm_area_struct
{
struct mm_struct * vm_mm; /* VM area parameters */
/* linked list of VM areas per task, sorted by address */
unsigned long vm_start; unsigned long vm_end;
struct vm_area_struct *vm_next;
pgprot_t vm_page_prot;
/* AVL tree of VM areas per task, sorted by address */
unsigned short vm_flags;
Memory Mapping (2/2)
short vm_avl_height;
struct vm_area_struct * vm_avl_left;
struct vm_area_struct * vm_avl_right;
/* For areas with inode, the list inode->i_mmap, for shm areas,
 * the list of attaches, otherwise unused. */
struct vm_area_struct *vm_next_share;
struct vm_area_struct **vm_pprev_share;
struct vm_operations_struct * vm_ops; /* operations table for this area */
unsigned long vm_offset;
struct file * vm_file; void * vm_private_data; /* was vm_pte */
};
Memory Mapping Operations
/*
 * Memory mapping operations: the function-pointer table a VM area
 * refers to through vm_area_struct->vm_ops.
 */
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
/* unmap, protect, sync and advise each take the area plus an
 * (unsigned long, size_t) pair; presumably start address and length
 * of the affected range -- TODO confirm. */
void (*unmap)(struct vm_area_struct *area, unsigned long, size_t);
void (*protect)(struct vm_area_struct *area, unsigned long, size_t,
unsigned int newprot);
int (*sync)(struct vm_area_struct *area, unsigned long, size_t,
unsigned int flags);
void (*advise)(struct vm_area_struct *area, unsigned long, size_t,
unsigned int advise);
/* Page-level hooks: both receive the area and an address. */
unsigned long (*nopage)(struct vm_area_struct * area,
unsigned long address, int write_access);
unsigned long (*wppage)(struct vm_area_struct * area,
unsigned long address, unsigned long page);
int (*swapout)(struct vm_area_struct *, struct page *);
};
Inode Cache

Inode_hashtable
sb->clear_inode sb->read_inode
(freeing inos) (iget)
or
sb->delete_inode Fs storage
(iput) Unused inodes
Fs storage

Dirty inodes sb->write_inode


media fs only (sync one)
(mark_inode_dirty) Used inodes Fs storage

Adapted from Linux Virtual File System Presentation © Peter J Bramm/CMU


struct dentry (1/2)
/* Represents a name to inode mapping. */
struct dentry {

int d_count;
unsigned int d_flags;
struct inode * d_inode; /* Where the name belongs to */
struct dentry * d_parent; /* parent directory */
struct dentry * d_mounts; /* mount information */
struct dentry * d_covers;
struct list_head d_hash; /* lookup hash list */
struct list_head d_lru; /* d_count = 0 LRU list */
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */

struct dentry (2/2)
struct list_head d_alias; /* inode alias list */
struct qstr d_name;
unsigned long d_time; /* used by d_revalidate */
struct dentry_operations *d_op; /* operations table for this dentry */
struct super_block * d_sb; /* The root of the dentry tree */
unsigned long d_reftime; /* last time referenced */
void * d_fsdata; /* fs-specific data */

/* small names */
unsigned char d_iname[DNAME_INLINE_LEN];
};
The Buffer Cache

Rusling, David A, The Linux Kernel, V0.8-3, LDP, 1999, S.9.3.


dentry_operations

/*
 * Function-pointer table for dentry operations; a dentry refers to one
 * of these through its d_op member.
 */
struct dentry_operations {
int (*d_revalidate)(struct dentry *, int);
int (*d_hash) (struct dentry *, struct qstr *);
/* Takes one dentry and two names (struct qstr). */
int (*d_compare) (struct dentry *, struct qstr *,

struct qstr *);


void (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
};
Dcache

dentry_hashtable (hash chains)
Dentry->hash(device #, name) list head
prune namei
Dentry->invalidate inode->lookup
dentry->drop dentry->add
unused dentries (dentry->lru chains)

Replacement:
level1_cache/level1_head
• LRU list of recently translated entries. Entries added to the end
may displace older entries if cache is full.
level_2_cache/level2_head
• LRU list of recently accessed entries (moved from level 1 on
second access).
Level 2 is safer – only displaced by a repeatedly accessed entry, not
just a new entry.

Adapted from Linux Virtual File System Presentation © Peter J Bramm/CMU


struct file_system_type

/* Represents an entire file system (one file system type). */
struct file_system_type {
const char *name;
int fs_flags;
/* Hook that takes a struct super_block * plus (void *, int) and
 * returns a struct super_block *. */
struct super_block * (*read_super)
(struct super_block *, void *, int);
struct file_system_type * next; /* presumably links the known types into a list -- TODO confirm */
};
struct super_block (1/2)
/* Represents the super block (metadata) of a file system. */
struct super_block {
struct list_head s_list; /* Keep this first */
kdev_t s_dev;
unsigned long s_blocksize;
unsigned char s_lock;
unsigned char s_rd_only;
unsigned char s_dirt;
struct inode *s_ibasket;
short int s_ibasket_count;
struct super_block (2/2)
short int s_ibasket_max;
struct list_head s_dirty; /* dirty inodes */
struct list_head s_files; /* files in use: file structures for open files on this file system */

/* File-system-specific super block information; one member per file system. */
union {
struct minix_sb_info minix_sb;
struct ext2_sb_info ext2_sb;
struct hpfs_sb_info hpfs_sb;
/* ... further file-system-specific members elided ... */
} u;
};
struct super_operations
struct super_operations {
void (*read_inode) (struct inode *);
void (*write_inode) (struct inode *);
void (*put_inode) (struct inode *);
void (*delete_inode) (struct inode *);
int (*notify_change) (struct dentry *, struct iattr *);
void (*put_super) (struct super_block *);
void (*write_super) (struct super_block *);
int (*statfs) (struct super_block *, struct statfs *, int);
int (*remount_fs) (struct super_block *, int *, char *);
void (*clear_inode) (struct inode *);
void (*umount_begin) (struct super_block *);
};
The Buffer Cache

Two main parts:


• Lists of empty buffers of several sizes: 512B, 1K, 2K, 4K, 8K
• Open-chaining hash table of block buffers: hash (device #,
block #) is index

Properties
• Block buffers are either in a free list or in the hash table
• All block buffers are also kept in an LRU list for
replacement
The Buffer Cache

Rusling, David A, The Linux Kernel, V0.8-3, LDP, 1999, S.9.3.


Victim Selection
Each block buffer is maintained on one of the
following LRU lists:
• BUF_CLEAN
• BUF_UNSHARED
• BUF_SHARED
• BUF_LOCKED – scheduled to be flushed
• BUF_LOCKED1 – super block and inode buffers that can’t
be flushed
• BUF_DIRTY

The victim is the best clean buffer. If a victim can’t
be found, the system will try to create more
buffers. If that fails, it will try to free block buffers
of other sizes and try again.
The bdflush Kernel Daemon

The bdflush daemon flushes dirty blocks, creating
clean blocks.
It normally sleeps, but wakes up:
• If the system runs out of clean buffers
• More than 60% (configurable) of the buffers are dirty
struct buffer_head (1/3)
/* Descriptor for one block buffer in the buffer cache. */
struct buffer_head {
/* First cache line: */
struct buffer_head *b_next; /* Hash queue list */
unsigned long b_blocknr; /* block number */
unsigned short b_size; /* block size */
unsigned short b_list; /* List that this buffer appears */
kdev_t b_dev; /* device (B_FREE = free) */

atomic_t b_count; /* users using this block */


kdev_t b_rdev; /* Real device */
struct buffer_head (2/3)
unsigned long b_state; /* buffer state bitmap (see above) */
unsigned long b_flushtime; /* Time to write (dirty) buffer */
struct buffer_head *b_next_free;/* lru/free list linkage */
struct buffer_head *b_prev_free;/* doubly linked list of buffers */
struct buffer_head *b_this_page;/* circular list of buffers in one page */
struct buffer_head *b_reqnext; /* request queue */
struct buffer_head (3/3)
struct buffer_head **b_pprev; /* 2x linked list of hash-queue */
char *b_data; /* pointer to data block (1024 bytes) */
void (*b_end_io)(struct buffer_head *bh, int uptodate);
/* I/O completion */
void *b_dev_id;
unsigned long b_rsector; /* Real buffer location on disk */
wait_queue_head_t b_wait;
struct kiobuf * b_kiobuf; /* kiobuf which owns this IO */
};

You might also like