This is the mail archive of the
systemtap@sourceware.org
mailing list for the systemtap project.
RE: [1/3] Userspace probes prototype-take2
- From: "Zhang, Yanmin" <yanmin dot zhang at intel dot com>
- To: <prasanna at in dot ibm dot com>, <systemtap at sources dot redhat dot com>
- Date: Wed, 15 Feb 2006 16:47:37 +0800
- Subject: RE: [1/3] Userspace probes prototype-take2
>>-----Original Message-----
>>From: systemtap-owner@sourceware.org [mailto:systemtap-owner@sourceware.org] On Behalf Of Prasanna S Panchamukhi
>>Sent: 2006年2月8日 22:11
>>To: systemtap@sources.redhat.com
>>Subject: [1/3] Userspace probes prototype-take2
>>
>>Here is take-2 on the user space probes prototype.
>>I have taken care of most of the comments from Yanmin Zhang.
>>Presently this patch set does not work with Config PREEMPT
>>enabled kernels. Next patch release will fix this.
>>
>>Thanks
>>Prasanna
>>+/**
>>+ * This routine looks for an existing uprobe at the given offset and inode.
>>+ * If it's found, returns the corresponding kprobe pointer.
>>+ */
>>+static struct kprobe __kprobes *get_kprobe_user(struct inode *inode,
>>+ unsigned long offset)
>>+{
>>+ struct hlist_head *head;
>>+ struct hlist_node *node;
>>+ struct kprobe *p, *kpr;
>>+ struct uprobe *uprobe;
>>+
>>+ head = &kprobe_table[hash_ptr((kprobe_opcode_t *)
>>+ (((unsigned long)inode) * offset), KPROBE_HASH_BITS)];
>>+
>>+ hlist_for_each_entry(p, node, head, hlist) {
[YM] Should hlist_for_each_entry be hlist_for_each_entry_rcu here?
>>+ if (kernel_text_address((unsigned long)p->addr))
>>+ continue;
>>+
>>+ if (p->pre_handler == aggr_pre_handler) {
>>+ kpr = list_entry(rcu_dereference(p)->list.next,
>>+ typeof(*kpr), list);
>>+ uprobe = container_of(kpr, struct uprobe, kp);
>>+ } else
>>+ uprobe = container_of(p, struct uprobe, kp);
>>+
>>+ if ((uprobe->inode == inode) && (uprobe->offset == offset))
>>+ return p;
>>+ }
>>+
>>+ return NULL;
>>+}
>>+
>>+/**
>>+ * Finds a uprobe at the specified user-space address in the current task.
>>+ * Points current_uprobe at that uprobe and returns the corresponding kprobe.
>>+ */
>>+static struct kprobe __kprobes *get_uprobe(void *addr)
>>+{
>>+ struct mm_struct *mm = current->mm;
>>+ struct vm_area_struct *vma;
>>+ struct inode *inode;
>>+ unsigned long offset;
>>+ struct kprobe *p, *kpr;
>>+ struct uprobe *uprobe;
>>+
>>+ down_read(&mm->mmap_sem);
>>+ vma = find_vma(mm, (unsigned long)addr);
>>+
>>+ BUG_ON(!vma); /* this should not happen, not in our memory map */
>>+
>>+ offset = (unsigned long)addr - vma->vm_start +
>>+ (vma->vm_pgoff << PAGE_SHIFT);
>>+ if (!vma->vm_file) {
>>+ up_read(&mm->mmap_sem);
>>+ return NULL;
>>+ }
>>+
>>+ inode = vma->vm_file->f_dentry->d_inode;
>>+ up_read(&mm->mmap_sem);
>>+
>>+ p = get_kprobe_user(inode, offset);
>>+ if (!p)
>>+ return NULL;
>>+
>>+ if (p->pre_handler == aggr_pre_handler) {
>>+ kpr = list_entry(rcu_dereference(p)->list.next, typeof(*kpr),
>>+ list);
>>+ uprobe = container_of(kpr, struct uprobe, kp);
>>+ } else
>>+ uprobe = container_of(p, struct uprobe, kp);
>>+
>>+ if (uprobe)
>>+ __get_cpu_var(current_uprobe) = uprobe;
>>+
>>+ return p;
>>+}
>>+
>>+/*
>>+ * This routine is called either:
>>+ * - under the kprobe_mutex - during kprobe_[un]register()
>>+ * OR
>>+ * - with preemption disabled - from arch/xxx/kernel/kprobes.c
>>+ */
>>+struct kprobe __kprobes *get_kprobe(void *addr)
>>+{
>>+ struct hlist_head *head;
>>+ struct hlist_node *node;
>>+ struct kprobe *p;
>>+
>>+ if (!kernel_text_address((unsigned long)addr))
>>+ return get_uprobe(addr);
>>+
>>+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
>>+ hlist_for_each_entry_rcu(p, node, head, hlist) {
>>+ if (p->addr == addr)
>>+ return p;
>>+ }
>>+ return NULL;
>>+}
>>+
>> /* Walks the list and increments nmissed count for multiprobe case */
>> void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
>> {
>>@@ -559,6 +643,342 @@ void __kprobes unregister_jprobe(struct
>> unregister_kprobe(&jp->kp);
>> }
>>
>>+typedef int (*process_uprobe_func_t)(struct uprobe *uprobe,
>>+ unsigned long *address, struct page *page);
>>+
>>+/**
>>+ * Adds the kprobe structure for the specified uprobe to either the
>>+ * kprobe_table or to the aggregate hash list for a given inode and offset.
>>+ * Also copies the instructions and inserts the breakpoint.
>>+ */
>>+int __kprobes insert_kprobe_user(struct uprobe *uprobe, unsigned long *address,
>>+ struct page *page)
>>+{
>>+ struct kprobe *old_p;
>>+ struct hlist_head *head;
>>+
>>+ uprobe->kp.nmissed = 0;
>>+ old_p = get_kprobe_user(uprobe->inode, uprobe->offset);
>>+ if (old_p)
>>+ return register_aggr_kprobe(old_p, &uprobe->kp);
>>+
>>+ head = &kprobe_table[hash_ptr((kprobe_opcode_t *)(uprobe->offset *
>>+ (unsigned long)uprobe->inode), KPROBE_HASH_BITS)];
>>+
>>+ INIT_HLIST_NODE(&uprobe->kp.hlist);
>>+ hlist_add_head_rcu(&uprobe->kp.hlist, head);
>>+
>>+ arch_copy_uprobe(&uprobe->kp, address);
>>+ arch_arm_uprobe(address);
>>+
>>+ return 0;
>>+}
>>+
>>+/**
>>+ * Wait for the page to be unlocked if someone else had locked it,
>>+ * then map the page and insert or remove the breakpoint.
>>+ */
>>+static int __kprobes map_uprobe_page(struct page *page, struct uprobe *uprobe,
>>+ process_uprobe_func_t process_kprobe_user)
>>+{
>>+ int ret = 0;
>>+ unsigned long *uprobe_address;
>>+
>>+ if (!page)
>>+ return -EINVAL; /* TODO: more suitable errno */
>>+
>>+ wait_on_page_locked(page);
[YM] It does not look appropriate to use wait_on_page_locked here. Usually, a thread locks the page
and calls readpage/readpages to deliver the I/O request to the lower layer, then calls wait_on_page_locked
to wait for the I/O to complete. Here, map_uprobe_page doesn't start readpage. I suggest using lock_page directly.
>>+ /* could probably retry readpage here. */
>>+ if (!PageUptodate(page))
>>+ return -EINVAL; /* TODO: more suitable errno */
>>+
>>+ lock_page(page);
>>+
>>+ uprobe_address = kmap(page);
>>+ uprobe_address = (unsigned long *)((unsigned long)uprobe_address +
>>+ (unsigned long) (uprobe->offset & ~PAGE_MASK));
>>+ ret = (*process_kprobe_user)(uprobe, uprobe_address, page);
>>+ kunmap(page);
>>+
>>+ unlock_page(page);
>>+
>>+ return ret;
>>+}
>>+
>>+/**
>>+ * flush_vma walks through the list of process private mappings,
>>+ * gets the vma containing the offset and flush all the vma's
>>+ * containing the probed page.
>>+ */
>>+static void __kprobes flush_vma(struct address_space *mapping,
>>+ struct page *page, struct uprobe *uprobe)
>>+{
>>+ struct vm_area_struct *vma = NULL;
>>+ struct prio_tree_iter iter;
>>+ struct prio_tree_root *head = &mapping->i_mmap;
>>+ struct mm_struct *mm;
>>+ unsigned long start, end, offset = uprobe->offset;
>>+
>>+ vma_prio_tree_foreach(vma, &iter, head, offset, offset) {
>>+ mm = vma->vm_mm;
>>+ down_read(&mm->mmap_sem);
>>+ spin_lock(&mapping->i_mmap_lock);
>>+
>>+
>>+ spin_lock(&mm->page_table_lock);
[YM] The locking here is confusing. I think mapping->i_mmap_lock is enough. Call spin_lock(&mm->page_table_lock)
before vma_prio_tree_foreach and spin_unlock after the vma_prio_tree_foreach loop.
>>+ start = vma->vm_start - (vma->vm_pgoff << PAGE_SHIFT);
>>+ end = vma->vm_end - (vma->vm_pgoff << PAGE_SHIFT);
>>+ spin_unlock(&mm->page_table_lock);
>>+
>>+ if ((start + offset) < end)
>>+ flush_icache_user_range(vma, page,
>>+ (unsigned long)uprobe->kp.addr,
>>+ sizeof(kprobe_opcode_t));
>>+ spin_unlock(&mapping->i_mmap_lock);
>>+ up_read(&mm->mmap_sem);
>>+ }
>>+}
>>+
>>+/**
>>+ * Check if the given offset lies within the given page range.
>>+ */
>>+static inline int find_page_probe(unsigned long offset,
>>+ unsigned long page_start)
>>+{
[YM] I suggest deleting this function. The caller could instead use:
if ((offset & PAGE_MASK) == (page_start & PAGE_MASK))
>>+ unsigned long page_end = page_start + PAGE_SIZE;
>>+
>>+ if ((offset >= page_start) && (offset < page_end))
>>+ return 1;
>>+
>>+ return 0;
>>+}
>>+
>>+/**
>>+ * Walk the uprobe_module_list and return the uprobe module with matching
>>+ * inode.
>>+ */
>>+static struct uprobe_module __kprobes *get_module_by_inode(struct inode *inode)
>>+{
>>+ struct uprobe_module *umodule;
>>+
>>+ list_for_each_entry(umodule, &uprobe_module_list, mlist) {
>>+ if (umodule->nd.dentry->d_inode == inode)
>>+ return umodule;
>>+ }
>>+
>>+ return NULL;
>>+}
>>+
>>+/**
>>+ * Gets exclusive write access to the given inode to ensure that the file
>>+ * on which probes are currently applied does not change. Use the function,
>>+ * deny_write_access_to_inode() we added in fs/namei.c.
>>+ */
>>+static inline int ex_write_lock(struct inode *inode)
>>+{
>>+ return deny_write_access_to_inode(inode);
>>+}
>>+
>>+/**
>>+ * Called when removing user space probes to release the write lock on the
>>+ * inode.
>>+ */
>>+static inline int ex_write_unlock(struct inode *inode)
>>+{
>>+ atomic_inc(&inode->i_writecount);
>>+ return 0;
>>+}
>>+
>>+/**
>>+ * Get the inode operations. This function leaves with the dentry held
>>+ * and taking with the inode writelock held to ensure that the file on
>>+ * which probes are currently active does not change from under us. Add uprobe
>>+ * and uprobe_module to the appropriate hash list. Also swithces i_op to
>>+ * hooks into readpage and readpages().
>>+ */
>>+static void __kprobes get_inode_ops(struct uprobe *uprobe,
>>+ struct uprobe_module *umodule)
>>+{
>>+ INIT_HLIST_HEAD(&umodule->ulist_head);
>>+ hlist_add_head(&uprobe->ulist, &umodule->ulist_head);
>>+ list_add(&umodule->mlist, &uprobe_module_list);
>>+}
>>+
>>+int __kprobes remove_kprobe_user(struct uprobe *uprobe, unsigned long *address,
>>+ struct page *page)
>>+{
>>+ struct kprobe *old_p, *list_p, *p;
>>+ int cleanup_p;
>>+
>>+ p = &uprobe->kp;
>>+ mutex_lock(&kprobe_mutex);
>>+ old_p = get_kprobe_user(uprobe->inode, uprobe->offset);
>>+ if (unlikely(!old_p)) {
>>+ mutex_unlock(&kprobe_mutex);
>>+ return 0;
>>+ }
>>+
>>+ if (p != old_p) {
>>+ list_for_each_entry_rcu(list_p, &old_p->list, list)
>>+ if (list_p == p)
>>+ /* kprobe p is a valid probe */
>>+ goto valid_p;
>>+ mutex_unlock(&kprobe_mutex);
>>+ return 0;
>>+ }
>>+
>>+valid_p:
>>+ if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
>>+ (p->list.next == &old_p->list) &&
>>+ (p->list.prev == &old_p->list))) {
>>+ /* Only probe on the hash list */
>>+ arch_disarm_uprobe(p, (kprobe_opcode_t *)address);
>>+ hlist_del_rcu(&old_p->hlist);
>>+ cleanup_p = 1;
>>+ } else {
>>+ list_del_rcu(&p->list);
>>+ cleanup_p = 0;
>>+ }
>>+
>>+ mutex_unlock(&kprobe_mutex);
>>+
>>+ synchronize_sched();
>>+ if (cleanup_p) {
>>+ if (p != old_p) {
>>+ list_del_rcu(&p->list);
>>+ kfree(old_p);
>>+ }
>>+ }
>>+
>>+ return 0;
>>+}
>>+
>>+/**
>>+ * unregister_uprobe: Disarms the probe, removes the kprobe and uprobe
>>+ * pointers from the hash lists. Unhooks readpage routines.
>>+ */
>>+void __kprobes unregister_uprobe(struct uprobe *uprobe)
>>+{
>>+ struct address_space *mapping;
>>+ struct uprobe_module *umodule;
>>+ struct page *page;
>>+ int ret = 0;
>>+
>>+ if (!uprobe->inode)
>>+ return;
>>+
>>+ mapping = uprobe->inode->i_mapping;
>>+
>>+ page = find_get_page(mapping, uprobe->offset >> PAGE_CACHE_SHIFT);
[YM] It would be better to check for page == NULL here. If page is NULL because the page was deleted,
the map_uprobe_page and flush_vma calls below should be bypassed.
>>+
>>+ ret = map_uprobe_page(page, uprobe, remove_kprobe_user);
>>+ /*
>>+ * TODO: unregister_uprobe should not fail, need to handle if it fails.
>>+ */
>>+ flush_vma(mapping, page, uprobe);
>>+
>>+ if (page)
>>+ page_cache_release(page);
>>+
>>+ mutex_lock(&uprobe_mutex);
>>+ if (!(umodule = get_module_by_inode(uprobe->inode)))
>>+ goto out;
>>+
>>+ hlist_del(&uprobe->ulist);
>>+ if (hlist_empty(&umodule->ulist_head)) {
>>+ list_del(&umodule->mlist);
>>+ ex_write_unlock(uprobe->inode);
>>+ path_release(&umodule->nd);
>>+ kfree(umodule);
>>+ }
>>+out:
>>+ mutex_unlock(&uprobe_mutex);
>>+}
>>+
>>+/**
>>+ * register_uprobe(): combination of inode and offset is used to identify each
>>+ * probe uniquely. Each uprobe can be found from the kprobes_hash table by
>>+ * using inode and offset. register_uprobe(), inserts the breakpoint at the
>>+ * given address by locating and mapping the page. return 0 on success and
>>+ * error on failure.
>>+ */
>>+int __kprobes register_uprobe(struct uprobe *uprobe)
>>+{
>>+ struct address_space *mapping;
>>+ struct uprobe_module *umodule = NULL;
>>+ struct inode *inode;
>>+ struct nameidata nd;
>>+ struct page *page;
>>+ int error = 0;
>>+
>>+ INIT_HLIST_NODE(&uprobe->ulist);
>>+
>>+ /*
>>+ * TODO: Need to calculate the absolute file offset for dynamic
>>+ * shared libraries.
>>+ * uprobe->offset = (unsigned long)uprobe->kp.addr & UPROBE_OFFSET_MASK;
>>+ */
>>+ if ((error = path_lookup(uprobe->pathname, LOOKUP_FOLLOW, &nd)))
>>+ return error;
>>+
>>+ inode = nd.dentry->d_inode;
>>+
>>+ error = ex_write_lock(inode);
>>+ if (error) {
>>+ path_release(&nd);
>>+ goto out;
>>+ }
>>+
>>+ mutex_lock(&uprobe_mutex);
>>+ /*
>>+ * Check if there are probes already on this application and add the
>>+ * corresponding uprobe to per application probe's list.
>>+ */
>>+ umodule = get_module_by_inode(inode);
>>+ if (!umodule) {
>>+ /*
>>+ * Allocate a uprobe_module structure for this application
>>+ * if not allocated before.
>>+ */
>>+ umodule = kzalloc(sizeof(struct uprobe_module), GFP_KERNEL);
>>+ if (!umodule) {
>>+ error = -ENOMEM;
>>+ path_release(&nd);
>>+ ex_write_unlock(inode);
>>+ goto out;
>>+ }
>>+ memcpy(&umodule->nd, &nd, sizeof(struct nameidata));
>>+ get_inode_ops(uprobe, umodule);
>>+ } else {
>>+ ex_write_unlock(inode);
>>+ hlist_add_head(&uprobe->ulist, &umodule->ulist_head);
>>+ }
>>+
>>+ uprobe->inode = inode;
>>+ mapping = inode->i_mapping;
>>+ mutex_lock(&kprobe_mutex);
>>+ page = find_get_page(mapping, (uprobe->offset >> PAGE_CACHE_SHIFT));
[YM] Same comment as for unregister_uprobe: it would be better to check for page == NULL here.
>>+
>>+ /*
>>+ * If error == -EINVAL, return success, probes will inserted by
>>+ * readpage hooks.
>>+ * TODO: Use a more suitable errno?
>>+ */
>>+ error = map_uprobe_page(page, uprobe, insert_kprobe_user);
>>+ if (error == -EINVAL)
>>+ error = 0;
>>+ flush_vma(mapping, page, uprobe);
>>+
>>+ if (page)
>>+ page_cache_release(page);
>>+
>>+ mutex_unlock(&kprobe_mutex);
>>+out:
>>+ mutex_unlock(&uprobe_mutex);
>>+
>>+ return error;
>>+}
>>+
>> #ifdef ARCH_SUPPORTS_KRETPROBES
>>
>> /*
>>@@ -650,6 +1070,8 @@ static int __init init_kprobes(void)
>> INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
>> }
>>
>>+ /* initialize uprobe_module_list */
>>+ INIT_LIST_HEAD(&uprobe_module_list);
>> err = arch_init_kprobes();
>> if (!err)
>> err = register_die_notifier(&kprobe_exceptions_nb);
>>@@ -666,4 +1088,5 @@ EXPORT_SYMBOL_GPL(unregister_jprobe);
>> EXPORT_SYMBOL_GPL(jprobe_return);
>> EXPORT_SYMBOL_GPL(register_kretprobe);
>> EXPORT_SYMBOL_GPL(unregister_kretprobe);
>>-
>>+EXPORT_SYMBOL_GPL(register_uprobe);
>>+EXPORT_SYMBOL_GPL(unregister_uprobe);
>>
>>_
>>--
>>Prasanna S Panchamukhi
>>Linux Technology Center
>>India Software Labs, IBM Bangalore
>>Email: prasanna@in.ibm.com
>>Ph: 91-80-51776329