Hi Solar,
To satisfy your curiosity (and also because I like beer), here's the full output with sources and disassembly. I've restricted the code to the relevant parts.
Sources:
Code:
x86_64/start.S:
_start:
cli
cld
/* ...cpu initialization, lgdt etc. irrelevant, so removed... */
/* jump to C function main() in 64 bit code segment */
xchg %bx, %bx
pushq $0x08
pushq $main
lretq
Code:
main.c:
#include <core.h>
void main()
{
lang_init();
while(1);
}
Code:
lang.c:
#include <core.h>
#include "lang.h"
void lang_init()
{
char fn[]="/sys/lang/core.\0\0\0\0\0";
char *s,*e,*a;
int i=0,l,k;
__asm__ __volatile__("xchg %bx, %bx"); <--- NOTE: there's no code in this function above this line
...
/* the rest of this function is irrelevant as it never gets executed. Nothing non ANSI C btw, no inline assembly. */
Please note the most important thing: there's nothing in the C code which could modify or mess up the stack pointer.
Compilation with full output (I haven't removed anything, no warnings or errors of any kind):
Code:
$ x86_64-elf-gcc -D_AS=1 -DDEBUG=1 -DOPTIMIZE=0 -ansi -Wall -Wextra -Wpedantic -O2 -fpic -ffreestanding -nostdinc -nostdlib -fno-stack-protector -I../../../include -I./ibmpc -mno-red-zone -c start.S -o start.o
$ x86_64-elf-gcc -DDEBUG=1 -DOPTIMIZE=0 -D_OSZ_CORE_=1 -D__x86_64__ -D__ibmpc__ -fpic -fno-stack-protector -fno-builtin -nostdlib -nostdinc -I. -I./x86_64 -I./x86_64/ibmpc -I../../include -ansi -Wall -Wextra -Wpedantic -ffreestanding -O2 -fno-delete-null-pointer-checks -fno-stack-protector -mno-red-zone -c main.c -o main.o
$ x86_64-elf-gcc -DDEBUG=1 -DOPTIMIZE=0 -D_OSZ_CORE_=1 -D__x86_64__ -D__ibmpc__ -fpic -fno-stack-protector -fno-builtin -nostdlib -nostdinc -I. -I./x86_64 -I./x86_64/ibmpc -I../../include -ansi -Wall -Wextra -Wpedantic -ffreestanding -O2 -fno-delete-null-pointer-checks -fno-stack-protector -mno-red-zone -c lang.c -o lang.o
GCC version: 8.2.1 20180831
Bochs output:
Code:
01991983883i[CPU0 ] [1991983883] Stopped on MAGIC BREAKPOINT
(0) Magic breakpoint
Next at t=1991983883
(0) [0x0000001380dc] 0008:ffffffffffe020dc (_start+5c): push 0x0000000000000008 ; 6a08
<bochs:4> s
Next at t=1991983884
(0) [0x0000001380de] 0008:ffffffffffe020de (_start+5e): push 0xffffffffffe05d20 ; 68205de0ff
<bochs:5> s
Next at t=1991983885
(0) [0x0000001380e3] 0008:ffffffffffe020e3 (_start+63): retf ; 48cb
<bochs:6> print-stack
Stack address size 8
| STACK 0xffffffffffffffe0 [0xffffffff:0xffe05d20] <--- NOTE: RSP is properly aligned
| STACK 0xffffffffffffffe8 [0x00000000:0x00000008]
...
<bochs:7> c
01991983900e[CPU0 ] write_linear_xmmword_aligned(): #GP misaligned access
01991983900e[CPU0 ] interrupt(long mode): IDT entry extended attributes DWORD4 TYPE != 0
01991983900e[CPU0 ] interrupt(long mode): IDT entry extended attributes DWORD4 TYPE != 0
01991983900i[CPU0 ] CPU is in long mode (active)
01991983900i[CPU0 ] CS.mode = 64 bit
01991983900i[CPU0 ] SS.mode = 64 bit
01991983900i[CPU0 ] EFER = 0x00000d01
01991983900i[CPU0 ] | RAX=0000000000000000 RBX=0000000000000000
01991983900i[CPU0 ] | RCX=0000000077bae39f RDX=00000000bfebfbff
01991983900i[CPU0 ] | RSP=ffffffffffffff78 RBP=0000000000001abe <--- NOTE: RSP is not aligned
01991983900i[CPU0 ] | RSI=0000000000003422 RDI=0000000000002b28
01991983900i[CPU0 ] | R8=0000000000000000 R9=0000000000000000
01991983900i[CPU0 ] | R10=0000000000000000 R11=0000000000000000
01991983900i[CPU0 ] | R12=0000000000000000 R13=0000000000000000
01991983900i[CPU0 ] | R14=0000000000000000 R15=0000000000000000
01991983900i[CPU0 ] | IOPL=0 ID vip vif ac vm RF nt of df if tf SF zf AF PF cf
01991983900i[CPU0 ] | SEG sltr(index|ti|rpl) base limit G D
01991983900i[CPU0 ] | CS:0008( 0001| 0| 0) 00000000 0000ffff 0 0
01991983900i[CPU0 ] | DS:001b( 0003| 0| 3) 00000000 0fffffff 1 0
01991983900i[CPU0 ] | SS:0010( 0002| 0| 0) 00000000 0fffffff 1 0
01991983900i[CPU0 ] | ES:001b( 0003| 0| 3) 00000000 0fffffff 1 0
01991983900i[CPU0 ] | FS:001b( 0003| 0| 3) 00000000 0fffffff 1 0
01991983900i[CPU0 ] | GS:001b( 0003| 0| 3) 00000000 0fffffff 1 0
01991983900i[CPU0 ] | MSR_FS_BASE:0000000000000000
01991983900i[CPU0 ] | MSR_GS_BASE:0000000000000000
01991983900i[CPU0 ] | RIP=ffffffffffe05d87 (ffffffffffe05d87)
01991983900i[CPU0 ] | CR0=0xe0000011 CR2=0x0000000000000000
01991983900i[CPU0 ] | CR3=0x0000a000 CR4=0x00000368
(0).[1991983900] [0x00000013bd87] 0008:ffffffffffe05d87 (lang_init+27): movaps dqword ptr ss:[rsp+16], xmm0 ; 0f29442410
01991983900p[CPU0 ] >>PANIC<< exception(): 3rd (13) exception with no resolution
01991983900e[CPU0 ] WARNING: Any simulation after this point is completely bogus !
Next at t=1991983901
(0) [0x00000013bd87] 0008:ffffffffffe05d87 (lang_init+27): movaps dqword ptr ss:[rsp+16], xmm0 ; 0f29442410
<bochs:8> q
Let's take a look at the generated code.
Objdump:
Code:
ffffffffffe02080 <_start>:
ffffffffffe02080: fa cli
ffffffffffe02081: fc cld
...
ffffffffffe020d9: 66 87 db xchg %bx,%bx <--- first xchg that I used to print stack
ffffffffffe020dc: 6a 08 pushq $0x8
ffffffffffe020de: 68 20 5d e0 ff pushq $0xffffffffffe05d20
ffffffffffe020e3: 48 cb lretq
...
ffffffffffe05d20 <main>:
ffffffffffe05d20: 48 83 ec 08 sub $0x8,%rsp
ffffffffffe05d24: 31 c0 xor %eax,%eax
ffffffffffe05d26: e8 35 00 00 00 callq ffffffffffe05d60 <lang_init>
...
ffffffffffe05d60 <lang_init>:
ffffffffffe05d60: 41 57 push %r15
ffffffffffe05d62: 41 56 push %r14
ffffffffffe05d64: 41 55 push %r13
ffffffffffe05d66: 41 54 push %r12
ffffffffffe05d68: 55 push %rbp
ffffffffffe05d69: 53 push %rbx
ffffffffffe05d6a: 48 83 ec 38 sub $0x38,%rsp <--- this causes the misalignment
ffffffffffe05d6e: 8b 05 c9 1b 00 00 mov 0x1bc9(%rip),%eax # ffffffffffe0793d <platform_dbgputc+0x520>
ffffffffffe05d74: f3 0f 6f 05 b1 1b 00 movdqu 0x1bb1(%rip),%xmm0 # ffffffffffe0792d <platform_dbgputc+0x510>
ffffffffffe05d7b: 00
ffffffffffe05d7c: 89 44 24 20 mov %eax,0x20(%rsp)
ffffffffffe05d80: 0f b6 05 ba 1b 00 00 movzbl 0x1bba(%rip),%eax # ffffffffffe07941 <platform_dbgputc+0x524>
ffffffffffe05d87: 0f 29 44 24 10 movaps %xmm0,0x10(%rsp)
ffffffffffe05d8c: 88 44 24 24 mov %al,0x24(%rsp)
ffffffffffe05d90: 66 87 db xchg %bx,%bx <--- second xchg never reached
As you can see from bochs output, when _start jumps to main(), the stack is properly aligned. Main does nothing, just calls a function. I'd like to point out that when lang_init() is called, the stack is still properly aligned, because of "sub $8" and the 8 bytes return address "callq" pushes. So if anything is happening, that's happening with the code gcc generates into lang_init().
There's nothing important in the lang_init() function, no code that could influence the generation of "movaps". The first instruction in the C source is the second "xchg %bx, %bx", which is never reached. If anything is generated into the function before that, it's 100% gcc's responsibility, such as the "sub $0x38, %rsp" instruction.
Now let's see what code is generated if we use "-O0". I haven't repeated _start as that hasn't changed.
Objdump:
Code:
ffffffffffe072fd <main>:
ffffffffffe072fd: 55 push %rbp
ffffffffffe072fe: 48 89 e5 mov %rsp,%rbp
ffffffffffe07301: b8 00 00 00 00 mov $0x0,%eax
ffffffffffe07306: e8 48 00 00 00 callq ffffffffffe07353 <lang_init>
...
ffffffffffe07353 <lang_init>:
ffffffffffe07353: 55 push %rbp
ffffffffffe07354: 48 89 e5 mov %rsp,%rbp
ffffffffffe07357: 48 83 ec 40 sub $0x40,%rsp
ffffffffffe0735b: 48 8b 05 4b 24 00 00 mov 0x244b(%rip),%rax # ffffffffffe097ad <platform_dbgputc+0x524>
ffffffffffe07362: 48 8b 15 4c 24 00 00 mov 0x244c(%rip),%rdx # ffffffffffe097b5 <platform_dbgputc+0x52c>
ffffffffffe07369: 48 89 45 c0 mov %rax,-0x40(%rbp)
ffffffffffe0736d: 48 89 55 c8 mov %rdx,-0x38(%rbp)
ffffffffffe07371: 8b 05 46 24 00 00 mov 0x2446(%rip),%eax # ffffffffffe097bd <platform_dbgputc+0x534>
ffffffffffe07377: 89 45 d0 mov %eax,-0x30(%rbp)
ffffffffffe0737a: 0f b6 05 40 24 00 00 movzbl 0x2440(%rip),%eax # ffffffffffe097c1 <platform_dbgputc+0x538>
ffffffffffe07381: 88 45 d4 mov %al,-0x2c(%rbp)
ffffffffffe07384: c7 45 e4 00 00 00 00 movl $0x0,-0x1c(%rbp)
ffffffffffe0738b: 66 87 db xchg %bx,%bx <--- reached without throwing a #GP
There's a "push %rbp", so with the return address pushed by "callq", the stack is still aligned, and %rsp has exactly the same memory address as with -O2 when lang_init() starts. No important change in main() then.
In lang_init() though, this time you'll find %rbp relative addressing mostly, and no instructions that could throw a general protection fault, therefore the second "xchg %bx, %bx" is reached as it should.
Conclusion: since the stack is properly aligned ((rsp & 0xF) == 0) on function call in both cases, the stack misalignment is caused by a gcc optimizer bug.
Cheers,
bzt