This is the mail archive of the
systemtap@sourceware.org
mailing list for the systemtap project.
Enhancements to block IO & IO scheduler tapset
- From: Prerna Saxena <prerna at linux dot vnet dot ibm dot com>
- To: systemtap at sourceware dot org
- Date: Mon, 12 Oct 2009 13:59:18 +0530
- Subject: Enhancements to block IO & IO scheduler tapset
Hi,
Here is a patch to add tracepoint-based probes to block IO and IO
scheduler tapsets.
I had to add new probe aliases instead of adding fallbacks to existing
probes because the tracepoint that flags an event, say
elv_add_request, is not defined at function entry but somewhere in the
interior. So an equivalent kprobe-based probe mapping to the same point
would need to be a statement probe, which I didn't think was a
scalable design choice.
Also, the handlers for a lot of probe aliases are duplicated -- they
essentially expose the same set of local variables available in the
probe. Is there some way I could reduce duplication here? I tried to
define multiple probe aliases that share the same probe definition.
For example:
probe ioscheduler_trace.plug = kernel.trace("block_plug"),
probe ioscheduler_trace.unplug_io = kernel.trace("block_unplug_io")
{
..do something..
}
But the systemtap translator doesn't seem to support that at the moment. Would this
be a good-to-have language enhancement? Or would a common initializing
function that could be called from each probe alias be useful here?
Looking forward to feedback...
--
Prerna Saxena
Linux Technology Centre,
IBM Systems and Technology Lab,
Bangalore, India
Index: stap-git-Oct-01/tapset/ioscheduler.stp
===================================================================
--- stap-git-Oct-01.orig/tapset/ioscheduler.stp
+++ stap-git-Oct-01/tapset/ioscheduler.stp
@@ -142,6 +142,174 @@ probe ioscheduler.elv_completed_request
%)
}
+/**
+ * probe ioscheduler_trace.elv_add_request : Indicates a request is added
+ * to the request queue.
+ * @elevator_name : The type of I/O elevator currently enabled.
+ * @rq : Address of request.
+ * @rq_flags : Request flags.
+ * @disk_major : Disk major number of request.
+ * @disk_minor : Disk minor number of request.
+ *
+ */
+probe ioscheduler_trace.elv_add_request
+ = kernel.trace("block_rq_insert")
+{
+elevator_name = kernel_string($q->elevator->elevator_type->elevator_name)
+rq = $rq
+
+if ($rq == 0 || $rq->rq_disk ==0) {
+ disk_major = -1
+ disk_minor = -1
+} else {
+ disk_major = $rq->rq_disk->major
+ disk_minor = $rq->rq_disk->first_minor
+}
+
+rq_flags = $rq==0? 0:$rq->cmd_flags
+}
+
+/**
+ * probe ioscheduler_trace.elv_completed_request : Fires when a request is
+ * completed.
+ * @elevator_name : The type of I/O elevator currently enabled.
+ * @rq : Address of request.
+ * @rq_flags : Request flags.
+ * @disk_major : Disk major number of request.
+ * @disk_minor : Disk minor number of request.
+ *
+ */
+probe ioscheduler_trace.elv_completed_request
+ = kernel.trace("block_rq_complete")
+{
+elevator_name = kernel_string($q->elevator->elevator_type->elevator_name)
+rq = $rq
+
+if ($rq == 0 || $rq->rq_disk ==0) {
+ disk_major = -1
+ disk_minor = -1
+} else {
+ disk_major = $rq->rq_disk->major
+ disk_minor = $rq->rq_disk->first_minor
+}
+
+rq_flags = $rq==0? 0:$rq->cmd_flags
+}
+
+/**
+ * probe ioscheduler_trace.elv_issue_request : Fires when a request is
+ * scheduled.
+ * @elevator_name : The type of I/O elevator currently enabled.
+ * @rq : Address of request.
+ * @rq_flags : Request flags.
+ * @disk_major : Disk major number of request.
+ * @disk_minor : Disk minor number of request.
+ *
+ */
+probe ioscheduler_trace.elv_issue_request
+ = kernel.trace("block_rq_issue")
+{
+elevator_name = kernel_string($q->elevator->elevator_type->elevator_name)
+rq = $rq
+
+if ($rq == 0 || $rq->rq_disk ==0) {
+ disk_major = -1
+ disk_minor = -1
+} else {
+ disk_major = $rq->rq_disk->major
+ disk_minor = $rq->rq_disk->first_minor
+}
+
+rq_flags = $rq==0? 0:$rq->cmd_flags
+}
+
+/**
+ * probe ioscheduler_trace.elv_requeue_request : Fires when a request is
+ * put back on the queue, when the hardware cannot accept more requests.
+ * @elevator_name : The type of I/O elevator currently enabled.
+ * @rq : Address of request.
+ * @rq_flags : Request flags.
+ * @disk_major : Disk major number of request.
+ * @disk_minor : Disk minor number of request.
+ *
+ */
+probe ioscheduler_trace.elv_requeue_request
+ = kernel.trace("block_rq_requeue")
+{
+elevator_name = kernel_string($q->elevator->elevator_type->elevator_name)
+rq = $rq
+
+if ($rq == 0 || $rq->rq_disk ==0) {
+ disk_major = -1
+ disk_minor = -1
+} else {
+ disk_major = $rq->rq_disk->major
+ disk_minor = $rq->rq_disk->first_minor
+}
+
+rq_flags = $rq==0? 0:$rq->cmd_flags
+}
+
+/**
+ * probe ioscheduler_trace.elv_abort_request : Fires when a request is aborted.
+ * @elevator_name : The type of I/O elevator currently enabled.
+ * @rq : Address of request.
+ * @rq_flags : Request flags.
+ * @disk_major : Disk major number of request.
+ * @disk_minor : Disk minor number of request.
+ *
+ */
+probe ioscheduler_trace.elv_abort_request
+ = kernel.trace("block_rq_abort")
+{
+elevator_name = kernel_string($q->elevator->elevator_type->elevator_name)
+rq = $rq
+
+if ($rq == 0 || $rq->rq_disk ==0) {
+ disk_major = -1
+ disk_minor = -1
+} else {
+ disk_major = $rq->rq_disk->major
+ disk_minor = $rq->rq_disk->first_minor
+}
+
+rq_flags = $rq==0? 0:$rq->cmd_flags
+}
+
+/**
+ * probe ioscheduler_trace.plug - Fires when a request queue is plugged;
+ * i.e., requests in the queue cannot be serviced by the block driver.
+ * @rq_queue : request queue
+ *
+ */
+probe ioscheduler_trace.plug = kernel.trace("block_plug")
+{
+ rq_queue = $q
+}
+
+/**
+ * probe ioscheduler_trace.unplug_io - Fires when a request queue is unplugged;
+ * either when the number of pending requests in the queue exceeds a threshold
+ * or upon expiration of the timer that was activated when the queue was plugged.
+ * @rq_queue : request queue
+ *
+ */
+probe ioscheduler_trace.unplug_io = kernel.trace("block_unplug_io")
+{
+ rq_queue = $q
+}
+
+/**
+ * probe ioscheduler_trace.unplug_timer - Fires when unplug timer associated
+ * with a request queue expires.
+ * @rq_queue : request queue
+ *
+ */
+probe ioscheduler_trace.unplug_timer = kernel.trace("block_unplug_timer")
+{
+ rq_queue = $q
+}
+
function disk_major_from_request:long(var_q:long)
%{ /* pure */
struct request_queue *q = (struct request_queue *)((long)THIS->var_q);
Index: stap-git-Oct-01/tapset/ioblock.stp
===================================================================
--- stap-git-Oct-01.orig/tapset/ioblock.stp
+++ stap-git-Oct-01/tapset/ioblock.stp
@@ -186,3 +186,156 @@ probe ioblock.end = kernel.function("bio
%)
size = $bio->bi_size
}
+
+/**
+ * probe ioblock_trace.bounce
+ *
+ * Fires whenever a buffer bounce is needed for at least one page of
+ * a block IO request.
+ *
+ * Context :
+ * The process creating a block IO request.
+ *
+ * Variables :
+ * $bio struct bio *
+ * $q struct request_queue*
+ * devname device for which a buffer bounce was needed.
+ * ino - i-node number of the mapped file
+ * bytes_done - number of bytes transferred
+ * sector - beginning sector for the entire bio
+ * flags - see below
+ * BIO_UPTODATE 0 ok after I/O completion
+ * BIO_RW_BLOCK 1 RW_AHEAD set, and read/write would block
+ * BIO_EOF 2 out-of-bounds error
+ * BIO_SEG_VALID 3 nr_hw_seg valid
+ * BIO_CLONED 4 doesn't own data
+ * BIO_BOUNCED 5 bio is a bounce bio
+ * BIO_USER_MAPPED 6 contains user pages
+ * BIO_EOPNOTSUPP 7 not supported
+ * error - 0 on success
+ * rw - binary trace for read/write request
+ * vcnt - bio vector count which represents the number of array elements (page,
+ * offset, length) which make up this I/O request
+ * idx - offset into the bio vector array
+ * phys_segments - number of segments in this bio after physical address
+ * coalescing is performed.
+ * size - total size in bytes
+ */
+probe ioblock_trace.bounce = kernel.trace("block_bio_bounce")
+{
+ devname = __bio_devname($bio)
+ ino = __bio_ino($bio)
+
+ bytes_done = $bio->bi_size
+ sector = $bio->bi_sector
+ flags = $bio->bi_flags
+ rw = $bio->bi_rw
+ vcnt = $bio->bi_vcnt
+ idx = $bio->bi_idx
+ phys_segments = $bio->bi_phys_segments
+ size = $bio->bi_size
+}
+
+/* probe ioblock_trace.request
+ *
+ * Fires just as a generic block I/O request is created for a bio.
+ *
+ * Context:
+ * The process that makes the block I/O request
+ *
+ * Variables:
+ * $bio struct bio* for which IO request is to be submitted
+ * $q struct request_queue* to which the request is to be added
+ * devname - block device name
+ * ino - i-node number of the mapped file
+ * sector - beginning sector for the entire bio
+ * flags - see below
+ * BIO_UPTODATE 0 ok after I/O completion
+ * BIO_RW_BLOCK 1 RW_AHEAD set, and read/write would block
+ * BIO_EOF 2 out-of-bounds error
+ * BIO_SEG_VALID 3 nr_hw_seg valid
+ * BIO_CLONED 4 doesn't own data
+ * BIO_BOUNCED 5 bio is a bounce bio
+ * BIO_USER_MAPPED 6 contains user pages
+ * BIO_EOPNOTSUPP 7 not supported
+ *
+ * rw - binary trace for read/write request
+ * vcnt - bio vector count which represents the number of array elements (page,
+ * offset, length) which make up this I/O request
+ * idx - offset into the bio vector array
+ * phys_segments - number of segments in this bio after physical address
+ * coalescing is performed.
+ * size - total size in bytes
+ * bdev - target block device
+ * bdev_contains - points to the device object which contains the
+ * partition (when bio structure represents a partition)
+ * p_start_sect - points to the start sector of the partition
+ * structure of the device
+ */
+
+probe ioblock_trace.request = kernel.trace("block_bio_queue")
+{
+ devname = __bio_devname($bio)
+ ino = __bio_ino($bio)
+
+ bytes_done = $bio->bi_size
+ error = $error
+ sector = $bio->bi_sector
+ flags = $bio->bi_flags
+ rw = $bio->bi_rw
+ vcnt = $bio->bi_vcnt
+ idx = $bio->bi_idx
+ phys_segments = $bio->bi_phys_segments
+ size = $bio->bi_size
+ bdev_contains = $bio->bi_bdev->bd_contains
+ bdev = $bio->bi_bdev
+ p_start_sect = __bio_start_sect($bio)
+}
+
+/* probe ioblock_trace.end
+ *
+ * Fires whenever a block I/O transfer is complete.
+ *
+ * Context:
+ * The process signals the transfer is done.
+ *
+ * Variables:
+ * devname - block device name
+ * ino - i-node number of the mapped file
+ * bytes_done - number of bytes transferred
+ * sector - beginning sector for the entire bio
+ * flags - see below
+ * BIO_UPTODATE 0 ok after I/O completion
+ * BIO_RW_BLOCK 1 RW_AHEAD set, and read/write would block
+ * BIO_EOF 2 out-of-bounds error
+ * BIO_SEG_VALID 3 nr_hw_seg valid
+ * BIO_CLONED 4 doesn't own data
+ * BIO_BOUNCED 5 bio is a bounce bio
+ * BIO_USER_MAPPED 6 contains user pages
+ * BIO_EOPNOTSUPP 7 not supported
+ *
+ * error - 0 on success
+ * rw - binary trace for read/write request
+ * vcnt - bio vector count which represents the number of array elements (page,
+ * offset, length) which make up this I/O request
+ * idx - offset into the bio vector array
+ * phys_segments - number of segments in this bio after physical address
+ * coalescing is performed.
+ * size - total size in bytes
+ */
+probe ioblock_trace.end = kernel.function("bio_endio")
+{
+ devname = __bio_devname($bio)
+ ino = __bio_ino($bio)
+
+ bytes_done = $bio->bi_size
+ error = $error
+
+ sector = $bio->bi_sector
+ flags = $bio->bi_flags
+ rw = $bio->bi_rw
+ vcnt = $bio->bi_vcnt
+ idx = $bio->bi_idx
+ phys_segments = $bio->bi_phys_segments
+ size = $bio->bi_size
+}