This is the mail archive of the systemtap@sources.redhat.com mailing list for the systemtap project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: x86_64 RIP-relative addressing bug


Sorry I didn't answer all the particular points you folks raised on
Wednesday.  I grew weary of the protracted discussion, and instead spent a
couple of hours writing the instruction analyzer function and testing it.

The function in a standalone C file is attached first.  Next is a C program
to drive test data through it.  The third attachment is a shell script that
massages objdump output to funnel instruction data and the correct results
(according to objdump's printing of %rip) to that program.  Do:

	gcc -O -g -o ripcheck ripcheck.c
	./riptest.sh vmlinux another-vmlinux more-vmlinux... /blah/*.ko

That runs objdump on all the binaries listed, and checks the detector
function's results on every instruction in the disassembly.  I ran the
rawhide vmlinux (2.6.10-1.1155_FC4smp) and my hand-built kernels through it
(I didn't actually try any .ko files), and it reported all correct answers
from the detector.

Then I went to the Pacific Film Archives for a screening of Tron.
That put me in the mood not to take any guff from uppity computers.

So, I plopped the detector function and the displacement fixup into the
kprobes code.  As I mentioned, this can't work with the instruction copy
pages located in the normal vmalloc region, which is statically located too
far away from kernel and module text.  So I hacked kprobes to do most of
the vmalloc guts directly, but using a chunk of address space immediately
above the module text area, where there is a goodly chunk left inside the
2GB distance from the base of the kernel text.  The result is like a
vmalloc, but doesn't waste a page after every allocated instruction page
like it was doing before (vanilla vmalloc always eats a following guard
page), and uses the appropriately close part of the address space, disjoint
from the vanilla vmalloc allocation area.

The fourth attachment is the kernel patch.  The final attachment is a
trivial kernel module that inserts a probe on the address you give with
"insmod testmodule probeaddr=0x...".  I inserted the probe at some spots
where %rip-relative addressing is used (and where it would have gone wild
immediately if the load was from the wrong address), and verified that it
was hit numerous times.  Seems to work.

If this all looks OK to everyone, I'll add some more comments to the kernel
code and submit the patch upstream.


Thanks,
Roland


#include <stdint.h>
#define s32 int32_t
#define u8 uint8_t

/* stolen from i386-dis.c */
static const unsigned char onebyte_has_modrm[256] = {
  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
  /*       -------------------------------        */
  /* 00 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 00 */
  /* 10 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 10 */
  /* 20 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 20 */
  /* 30 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 30 */
  /* 40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40 */
  /* 50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 50 */
  /* 60 */ 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0, /* 60 */
  /* 70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 70 */
  /* 80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 80 */
  /* 90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 90 */
  /* a0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* a0 */
  /* b0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* b0 */
  /* c0 */ 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0, /* c0 */
  /* d0 */ 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1, /* d0 */
  /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* e0 */
  /* f0 */ 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1  /* f0 */
  /*       -------------------------------        */
  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
};

static const unsigned char twobyte_has_modrm[256] = {
  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
  /*       -------------------------------        */
  /* 00 */ 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1, /* 0f */
  /* 10 */ 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, /* 1f */
  /* 20 */ 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1, /* 2f */
  /* 30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 3f */
  /* 40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 4f */
  /* 50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 5f */
  /* 60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 6f */
  /* 70 */ 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1, /* 7f */
  /* 80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 8f */
  /* 90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 9f */
  /* a0 */ 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1, /* af */
  /* b0 */ 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1, /* bf */
  /* c0 */ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, /* cf */
  /* d0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* df */
  /* e0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ef */
  /* f0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0  /* ff */
  /*       -------------------------------        */
  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
};

static s32 *is_riprel(u8 *insn)
{
  int need_modrm;
  u8 modrm;

  /* Skip legacy prefixes.  */
  while (1) {
    switch (*insn) {
    case 0x66:
    case 0x67:
    case 0x2e:
    case 0x3e:
    case 0x26:
    case 0x64:
    case 0x65:
    case 0x36:
    case 0xf0:
    case 0xf3:
    case 0xf2:
      ++insn;
      continue;
    }
    break;
  }

  /* Skip REX prefix.  */
  if ((*insn & 0xf0) == 0x40)
    ++insn;

  if (*insn == 0x0f)		/* Two-byte opcode.  */
    need_modrm = twobyte_has_modrm[*++insn];
  else				/* One-byte opcode.  */
    need_modrm = onebyte_has_modrm[*insn];

  if (!need_modrm)
    return NULL;

  modrm = *++insn;
  if ((modrm & 0xc7) != 0x05)
    return NULL;

  return (s32 *) ++insn;	/* Displacement follows SIB byte.  */
}
#define _GNU_SOURCE
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct insn {
  uint64_t pc;
  size_t nbytes;
  char bytes[15];
  uint32_t disp32;
  bool riprel;
};


static int32_t *is_riprel(uint8_t *insnbytes);

int main(void)
{
  size_t lose = 0;
  size_t nlines = 0;

  char *line = NULL;
  size_t len = 0;
  while (getline (&line, &len, stdin) > 0)
    {
      char *p = line;
      struct insn ti;
      size_t i;
      size_t n;
      bool riprel;
      const uint32_t *ripdisp;

      ++nlines;

      ti.pc = strtoull (p, &p, 16);
      switch (*p++) {
      case '+':
	ti.riprel = true;
	ti.disp32 = strtoul (p, &p, 0);
	if (*p++ != ',')
	  goto badinput;
	break;
      case '-':
	ti.riprel = false;
	break;
      default:
	goto badinput;
      }
      ti.nbytes = strtoul (p, &p, 0);
      if (*p != ',') {
      badinput:
	fprintf(stderr, "malformed input line %u: %s\n", nlines, line);
	return 1;
      }
      for (i = 0; i < 16; ++i) {
	ti.bytes[i] = strtoul(p + 1, &p, 16);
	if (*p != ',')
	  break;
      }
      if (*p != '\n' || i+1 != ti.nbytes)
	goto badinput;

      if (nlines % 1000 == 0) {
	printf("\r%u\r", nlines);
	fflush(stdout);
      }

      ripdisp = is_riprel(ti.bytes);
      riprel = ripdisp != NULL;
      if (!riprel != !ti.riprel)
	{
	  printf ("PC %#llx: false %stive\n",
		  ti.pc, riprel ? "posi" : "nega");
	  ++lose;
	}
      if (riprel && (const char *) (ripdisp + 1) - ti.bytes > ti.nbytes)
	{
	  printf ("PC %#llx: scanned size > %u\n",
		  ti.pc, ti.nbytes);
	  ++lose;
	}
      else if (riprel && *ripdisp != ti.disp32)
	{
	  printf ("PC %#llx: displacement (byte %u) %#x != %#x\n",
		  ti.pc, (const char *) ripdisp - ti.bytes,
		  *ripdisp, ti.disp32);
	  ++lose;
	}

      if (lose > 50)
	return 1;
    }

  printf ("tested %u instructions, all good!\n", nlines);

  return 0;
}

#include "riprel.c"
#!/bin/sh

objdump -dw "$@" | awk '
! /^[0-9a-f]+:/ { next }
/\(bad\)/ { next }
$NF == "data16" && $(NF-1) == "66" { next }

{
  sub(/:/, "", $1);
  pc = "0x" $1;
  bytes = "";
  nbytes = 0;
  for (i = 2; i < NF; ++i) {
    if ($i !~ /^[0-9a-f][0-9a-f]$/) break;
    bytes = bytes "," $i;
    ++nbytes;
  }

  riprel = 0;
  if (/\(%rip\)/) {
    while (i <= NF) {
      if ($i ~ /%rip/) {
	riprel = 1;
	ofs = gensub(/^.*[, *]([-0-9]+)\(%rip\).*$/, "\\1", $i);
	break;
      }
      ++i;
    }
    if (!riprel) exit 99;
    print pc "+" ofs "," nbytes bytes;
  }
  else {
    print pc "-" nbytes bytes;
  }
}
' | `dirname $0`/ripcheck
--- linux-2.6/arch/x86_64/kernel/kprobes.c
+++ linux-2.6/arch/x86_64/kernel/kprobes.c
@@ -86,9 +86,105 @@ int arch_prepare_kprobe(struct kprobe *p
 	return 0;
 }
 
+/* stolen from i386-dis.c */
+static const unsigned char onebyte_has_modrm[256] = {
+  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
+  /*       -------------------------------        */
+  /* 00 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 00 */
+  /* 10 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 10 */
+  /* 20 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 20 */
+  /* 30 */ 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0, /* 30 */
+  /* 40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40 */
+  /* 50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 50 */
+  /* 60 */ 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0, /* 60 */
+  /* 70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 70 */
+  /* 80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 80 */
+  /* 90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 90 */
+  /* a0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* a0 */
+  /* b0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* b0 */
+  /* c0 */ 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0, /* c0 */
+  /* d0 */ 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1, /* d0 */
+  /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* e0 */
+  /* f0 */ 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1  /* f0 */
+  /*       -------------------------------        */
+  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
+};
+
+static const unsigned char twobyte_has_modrm[256] = {
+  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
+  /*       -------------------------------        */
+  /* 00 */ 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1, /* 0f */
+  /* 10 */ 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, /* 1f */
+  /* 20 */ 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1, /* 2f */
+  /* 30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 3f */
+  /* 40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 4f */
+  /* 50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 5f */
+  /* 60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 6f */
+  /* 70 */ 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1, /* 7f */
+  /* 80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 8f */
+  /* 90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 9f */
+  /* a0 */ 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1, /* af */
+  /* b0 */ 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1, /* bf */
+  /* c0 */ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, /* cf */
+  /* d0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* df */
+  /* e0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ef */
+  /* f0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0  /* ff */
+  /*       -------------------------------        */
+  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
+};
+
+static s32 *is_riprel(u8 *insn)
+{
+  int need_modrm;
+  u8 modrm;
+
+  /* Skip legacy prefixes.  */
+  while (1) {
+    switch (*insn) {
+    case 0x66:
+    case 0x67:
+    case 0x2e:
+    case 0x3e:
+    case 0x26:
+    case 0x64:
+    case 0x65:
+    case 0x36:
+    case 0xf0:
+    case 0xf3:
+    case 0xf2:
+      ++insn;
+      continue;
+    }
+    break;
+  }
+
+  /* Skip REX prefix.  */
+  if ((*insn & 0xf0) == 0x40)
+    ++insn;
+
+  if (*insn == 0x0f)		/* Two-byte opcode.  */
+    need_modrm = twobyte_has_modrm[*++insn];
+  else				/* One-byte opcode.  */
+    need_modrm = onebyte_has_modrm[*insn];
+
+  if (!need_modrm)
+    return NULL;
+
+  modrm = *++insn;
+  if ((modrm & 0xc7) != 0x05)
+    return NULL;
+
+  return (s32 *) ++insn;	/* Displacement follows SIB byte.  */
+}
+
 void arch_copy_kprobe(struct kprobe *p)
 {
+	s32 *ripdisp;
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
+	ripdisp = is_riprel(p->ainsn.insn);
+	if (ripdisp) {
+		*ripdisp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
+	}
 }
 
 void arch_remove_kprobe(struct kprobe *p)
@@ -417,6 +513,8 @@ static kprobe_opcode_t *get_insn_slot(vo
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
+	struct vm_struct *area;
+	struct page **pages;
 
 	hlist_for_each(pos, &kprobe_insn_pages) {
 		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
@@ -439,12 +537,40 @@ static kprobe_opcode_t *get_insn_slot(vo
 	if (!kip) {
 		return NULL;
 	}
-	kip->insns = (kprobe_opcode_t*) __vmalloc(PAGE_SIZE,
-		GFP_KERNEL|__GFP_HIGHMEM, __pgprot(__PAGE_KERNEL_EXEC));
-	if (!kip->insns) {
+
+	/*
+	 * This basically replicates __vmalloc, except that it uses
+	 * a range of addresses starting at MODULE_END.  This also
+	 * allocates a single page of address space with no following
+	 * guard page.  We set up all the data structures here such
+	 * that a normal vfree call tears them all down just right.
+	 */
+	area = __get_vm_area(0, VM_ALLOC,
+			     MODULES_END, MODULES_END + MODULES_LEN);
+	if (!area)
+		goto fail_kip;
+	area->nr_pages = 1;
+	area->pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
+	if (!area->pages)
+		goto fail_area;
+	area->pages[0] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+	if (!area->pages[0])
+		goto fail_pages;
+	pages = area->pages;
+	if (map_vm_area(area, PAGE_KERNEL_EXEC, &pages)) {
+		__free_page(area->pages[0]);
+	fail_pages:
+		kfree(area->pages);
+	fail_area:
+		remove_vm_area(area->addr);
+		kfree(area);
+	fail_kip:
 		kfree(kip);
 		return NULL;
 	}
+	BUG_ON(pages != area->pages + 1);
+	kip->insns = (kprobe_opcode_t *) area->addr;
+
 	INIT_HLIST_NODE(&kip->hlist);
 	hlist_add_head(&kip->hlist, &kprobe_insn_pages);
 	memset(kip->slot_used, 0, INSNS_PER_PAGE);
#include <linux/kprobes.h>
#include <linux/module.h>

static int hit_pre, hit_post;

int test_probe_pre(struct kprobe *kp, struct pt_regs *regs)
{
  ++hit_pre;
  return 0;
}

void test_probe_post(struct kprobe *kp,
		     struct pt_regs *regs, unsigned long flags)
{
  ++hit_post;
}

static struct kprobe test_probe = {
  .pre_handler = test_probe_pre,
  .post_handler = test_probe_post,
};


MODULE_DESCRIPTION("trivial test probe module");
MODULE_LICENSE("GPL");

static unsigned long probeaddr;
module_param(probeaddr, ulong, 0);
MODULE_PARM_DESC(probeaddr,
		 "\nKernel instruction at which to insert the probe.\n");

static void dumpinsn(char *tag, unsigned char *pc)
{
  int i;
  printk("\n*** %s %lx\n\t", tag, (unsigned long)pc);
  for (i = 0; i < 15; ++i)
    printk (" %02x", pc[i]);
  printk (" .\n");
}

static int __init init_testprobe(void)
{
  int rc;
  if (probeaddr == 0) {
    printk("Test probe module requires probeaddr=ADDRESS parameter.\n");
    return -EINVAL;
  }
  test_probe.addr = (kprobe_opcode_t *) probeaddr;
  dumpinsn("before", test_probe.addr);
  rc = register_kprobe(&test_probe);
  if (rc == 0) {
    dumpinsn("after", test_probe.addr);
    dumpinsn("copy", test_probe.ainsn.insn);
  }
  return rc;
}

static void __exit exit_testprobe(void)
{
  unregister_kprobe(&test_probe);
  printk("Test probe at %lx: %d pre hits, %d post hits\n",
	 probeaddr, hit_pre, hit_post);
}

module_init(init_testprobe);
module_exit(exit_testprobe);

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]