This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 1/3] Refactor trampoline code.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 19 Oct 2013 10:24:12 +0200
- Subject: [PATCH 1/3] Refactor trampoline code.
- Authentication-results: sourceware.org; auth=none
Hi, I have returned to storing floating point registers. A first step is to
refactor the code so that it can be factored out in a second patch, and the
SSE saving logic in a third patch. The code is currently inconsistent in several ways:
_dl_runtime_resolve does not save r10 and r11, but the others do.
_dl_runtime_resolve does not clobber rax, but the others do. This could be
a problem with variadic calls. The relevant part of the ABI is:
"
Note that %r11 is neither required to be preserved, nor is it used to pass arguments.
Making this register available as scratch register means that code in the PLT
need not spill any registers when computing the address to which control needs to be transferred.
%rax is used to indicate the number of vector arguments passed to a function requiring a variable
number of arguments. %r10 is used for passing a function's static chain pointer
"
Could somebody clarify? How does the part about r10 holding the static chain pointer apply?
The following patch passes the tests and should work if we assume that r10 and r11 are scratch registers.
* sysdeps/x86_64/dl-tlsdesc.S: Refactor trampolines.
* sysdeps/x86_64/dl-trampoline.S: Likewise.
---
sysdeps/x86_64/dl-tlsdesc.S | 53 ++++++++++++++++------------------------
sysdeps/x86_64/dl-trampoline.S | 8 ++++--
2 files changed, 27 insertions(+), 34 deletions(-)
diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
index de5219a..c439c7e 100644
--- a/sysdeps/x86_64/dl-tlsdesc.S
+++ b/sysdeps/x86_64/dl-tlsdesc.S
@@ -98,43 +98,37 @@ _dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
_dl_tlsdesc_dynamic:
/* Preserve call-clobbered registers that we modify.
We need two scratch regs anyway. */
- movq %rsi, -16(%rsp)
- movq %fs:DTV_OFFSET, %rsi
- movq %rdi, -8(%rsp)
- movq TLSDESC_ARG(%rax), %rdi
- movq (%rsi), %rax
- cmpq %rax, TLSDESC_GEN_COUNT(%rdi)
+ movq %fs:DTV_OFFSET, %r10
+ movq TLSDESC_ARG(%rax), %r11
+ movq (%r10), %rax
+ cmpq %rax, TLSDESC_GEN_COUNT(%r11)
ja .Lslow
- movq TLSDESC_MODID(%rdi), %rax
+ movq TLSDESC_MODID(%r11), %rax
salq $4, %rax
- movq (%rax,%rsi), %rax
+ movq (%rax,%r10), %rax
cmpq $-1, %rax
je .Lslow
- addq TLSDESC_MODOFF(%rdi), %rax
+ addq TLSDESC_MODOFF(%r11), %rax
.Lret:
- movq -16(%rsp), %rsi
subq %fs:0, %rax
- movq -8(%rsp), %rdi
ret
.Lslow:
/* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
- r10 and r11. Also, align the stack, that's off by 8 bytes. */
+ Also, align the stack, that's off by 8 bytes. */
subq $72, %rsp
cfi_adjust_cfa_offset (72)
movq %rdx, 8(%rsp)
movq %rcx, 16(%rsp)
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
- movq %r10, 40(%rsp)
- movq %r11, 48(%rsp)
- /* %rdi already points to the tlsinfo data structure. */
+
+ movq %r11, %rdi
call __tls_get_addr@PLT
+
movq 8(%rsp), %rdx
movq 16(%rsp), %rcx
movq 24(%rsp), %r8
movq 32(%rsp), %r9
- movq 40(%rsp), %r10
- movq 48(%rsp), %r11
addq $72, %rsp
cfi_adjust_cfa_offset (-72)
jmp .Lret
@@ -164,27 +158,26 @@ _dl_tlsdesc_dynamic:
_dl_tlsdesc_resolve_rela:
cfi_adjust_cfa_offset (8)
/* Save all call-clobbered registers. */
+ movq (%rsp), %r11
subq $72, %rsp
cfi_adjust_cfa_offset (72)
movq %rax, (%rsp)
movq %rdi, 8(%rsp)
- movq %rax, %rdi /* Pass tlsdesc* in %rdi. */
movq %rsi, 16(%rsp)
- movq 72(%rsp), %rsi /* Pass link_map* in %rsi. */
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
- movq %r10, 40(%rsp)
- movq %r11, 48(%rsp)
movq %rdx, 56(%rsp)
movq %rcx, 64(%rsp)
+
+ movq %rax, %rdi /* Pass tlsdesc* in %rdi. */
+ movq %r11, %rsi /* Pass link_map* in %rsi. */
call _dl_tlsdesc_resolve_rela_fixup
+
movq (%rsp), %rax
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 24(%rsp), %r8
movq 32(%rsp), %r9
- movq 40(%rsp), %r10
- movq 48(%rsp), %r11
movq 56(%rsp), %rdx
movq 64(%rsp), %rcx
addq $80, %rsp
@@ -210,35 +203,31 @@ _dl_tlsdesc_resolve_rela:
cfi_startproc
.align 16
_dl_tlsdesc_resolve_hold:
-0:
/* Save all call-clobbered registers. */
subq $72, %rsp
cfi_adjust_cfa_offset (72)
movq %rax, (%rsp)
movq %rdi, 8(%rsp)
- movq %rax, %rdi /* Pass tlsdesc* in %rdi. */
movq %rsi, 16(%rsp)
/* Pass _dl_tlsdesc_resolve_hold's address in %rsi. */
- leaq . - _dl_tlsdesc_resolve_hold(%rip), %rsi
movq %r8, 24(%rsp)
movq %r9, 32(%rsp)
- movq %r10, 40(%rsp)
- movq %r11, 48(%rsp)
movq %rdx, 56(%rsp)
movq %rcx, 64(%rsp)
+
+ movq %rax, %rdi /* Pass tlsdesc* in %rdi. */
+ leaq . - _dl_tlsdesc_resolve_hold(%rip), %rsi
call _dl_tlsdesc_resolve_hold_fixup
-1:
+
movq (%rsp), %rax
movq 8(%rsp), %rdi
movq 16(%rsp), %rsi
movq 24(%rsp), %r8
movq 32(%rsp), %r9
- movq 40(%rsp), %r10
- movq 48(%rsp), %r11
movq 56(%rsp), %rdx
movq 64(%rsp), %rcx
addq $72, %rsp
cfi_adjust_cfa_offset (-72)
- jmp *(%eax)
+ jmp *(%rax)
cfi_endproc
.size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index a25e390..4212145 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -30,6 +30,8 @@
.align 16
cfi_startproc
_dl_runtime_resolve:
+ movq (%rsp), %r10
+ movq 8(%rsp), %r11
cfi_adjust_cfa_offset(16) # Incorporate PLT
subq $56,%rsp
cfi_adjust_cfa_offset(56)
@@ -40,10 +42,12 @@ _dl_runtime_resolve:
movq %rdi, 32(%rsp)
movq %r8, 40(%rsp)
movq %r9, 48(%rsp)
- movq 64(%rsp), %rsi # Copy args pushed by PLT in register.
- movq 56(%rsp), %rdi # %rdi: link_map, %rsi: reloc_index
+
+ movq %r11, %rsi # Copy args pushed by PLT in register.
+ movq %r10, %rdi # %rdi: link_map, %rsi: reloc_index
call _dl_fixup # Call resolver.
movq %rax, %r11 # Save return value
+
movq 48(%rsp), %r9 # Get register content back.
movq 40(%rsp), %r8
movq 32(%rsp), %rdi
--
1.7.10.4