OSDev.org

The Place to Start for Operating System Developers
It is currently Thu Mar 28, 2024 3:44 am

All times are UTC - 6 hours




Post new topic Reply to topic  [ 4 posts ] 
Author Message
 Post subject: SMP: Problems with starting APs
PostPosted: Sun Aug 07, 2022 8:06 am 
Offline

Joined: Sun Aug 07, 2022 7:32 am
Posts: 2
Location: probably his house
Hi,

I am writing a long mode OS kernel and I have implemented quite a few features (memory allocation, parsing ACPI tables, task switching) but I have had a few problems with multiprocessing. I have successfully initialized the local APIC on the BSP and I can send self-IPIs. I have also implemented the IO-APIC for the purposes of calibrating the time-stamp counter (for udelay()). Right now, I am trying to boot application processors.

This is the code I use for starting an AP (apboot.c):
Code:

extern volatile u8 __ap_flag;

void start_ap(struct cpu *ap)
{
   printk("starting CPU%u...\n", ap->cpu_id);

   __ap_flag = 0;

   apic_write_icr(ap->lapic_id, 0xc500); /* send INIT level assert */
   apic_wait_icr();

   apic_write_icr(ap->lapic_id, 0x8500); /* send INIT level de-assert */
   apic_wait_icr();

   udelay(10000);

   u32 apic_error;

   apic_read_esr();

   apic_write_icr(ap->lapic_id, 0x608);

   udelay(200);
   apic_wait_icr();

   apic_error = apic_read_esr();
   if(apic_error)
      panic("APIC error 0x%x when starting CPU%u", apic_error, ap->cpu_id);

   udelay(10000);

   if(!__ap_flag)
      panic("failed to start CPU%u", ap->cpu_id);

   printk("started CPU%u\n", ap->cpu_id);
}



Here are my implementations of APIC functions (if you'd need them, but you probably won't). The implementations for xAPIC and x2APIC reside in different files and I use a struct apic_interface to call these functions.
Code:

static void normal_apic_write_icr(u32 dst, u32 flags)
{
   mmapic_write_reg(MMAPIC_ICR_HIGH, dst << 24);
   mmapic_write_reg(MMAPIC_ICR_LOW, flags);
}

static void normal_apic_wait_icr(void)
{
   while(mmapic_read_reg(MMAPIC_ICR_LOW) & (1 << 12))
      asm volatile("pause" : : : "memory");
}

static void x2apic_write_icr(u32 dst, u32 flags)
{
   write_msr(MSR_X2APIC_ICR, ((u64) dst << 32) | flags);
}

static void x2apic_wait_icr(void) {}



My code panics and prints
Quote:
failed to start CPU1


This is the code that should run on the AP, shamelessly copied from SMP.
The linker script places it at 0x8000. (apboot.S, AT&T syntax):
Code:

.code16
.section .realmode

__ap_entry:   /* 0x8000 */
   cli
   cld
   ljmp $0, $0x8040

.align 16
__gdt32:   /* 0x8010 */
   .long 0, 0
   .long 0x0000ffff, 0x00cf9a00
   .long 0x0000ffff, 0x008f9200
   .long 0x00000068, 0x00cf8900
__gdt_ptr:   /* 0x8030 */
   .word 0x1f
   .long 0x8010
   .long 0, 0

.align 64
__ap_jmp_target: /* 0x8040 */
   xorw %ax, %ax
   movw %ax, %ds
   lgdtl 0x8030
   movl %cr0, %eax
   orl $1, %eax
   movl %eax, %cr0
   ljmp $8, $8060

.code32
.align 32
__ap_pm: /* 0x8060 */
   movw $16, %ax
   movw %ax, %ds
   movw %ax, %ss

   movb $1, __ap_flag

__hlt_loop:
   cli
   hlt
   jmp __hlt_loop

.align 16
.globl __ap_flag
__ap_flag:
   .byte 0



This part of the linker script causes .realmode to be placed at 0x8000:
Code:

ENTRY(_start)
SECTIONS
{
   /*
    * AP startup code needs to be placed below 1MB.
    * Therefore we give it a separate segment.
    */
   . = 0x8000;
   __realmode_start = .;
   .realmode :
   {
      *(.realmode)
   }

   . = ALIGN(4K);
   __realmode_end = .;

        ... (boot, text, data etc)

}



In bochs, I get the following logs:

Code:

00133390123i[CPU0  ] WRMSR: wrote 00000000:fee00800 to MSR_APICBASE
00133390123i[APIC0 ] allocate APIC id=0 (MMIO enabled) to 0x0000fee00000
00134628551i[APIC1 ] Deliver INIT IPI
00134628551i[CPU1  ] cpu software reset
00134628551i[APIC1 ] allocate APIC id=1 (MMIO enabled) to 0x0000fee00000
00134628551i[CPU1  ] CPU[1] is an application processor. Halting until SIPI.

// I removed some CPUID things, they're just unneeded

00134668576i[APIC1 ] Deliver Start Up IPI
00134668576i[CPU1  ] CPU 1 started up at 0800:00000000 by APIC



When I turn off the machine, I get the following logs:

Code:

00259752000p[XGUI  ] >>PANIC<< POWER button turned off.
========================================================================
Bochs is exiting with the following message:
[XGUI  ] POWER button turned off.
========================================================================
00259752000i[CPU0  ] CPU is in long mode (active)
00259752000i[CPU0  ] CS.mode = 64 bit
00259752000i[CPU0  ] SS.mode = 64 bit
00259752000i[CPU0  ] EFER   = 0x00000500
00259752000i[CPU0  ] | RAX=0000000000000000  RBX=ffff8000007ade28
00259752000i[CPU0  ] | RCX=000000000000000a  RDX=ffffffff81071c20
00259752000i[CPU0  ] | RSP=ffffffff81088eda  RBP=ffffffff81088f2a
00259752000i[CPU0  ] | RSI=0000000000000000  RDI=0000000000000004
00259752000i[CPU0  ] |  R8=0000000000000000   R9=0000000000000000
00259752000i[CPU0  ] | R10=0000000000000000  R11=ffffffff81088d8a
00259752000i[CPU0  ] | R12=ffffffff8106ea51  R13=ffffffff803b3814
00259752000i[CPU0  ] | R14=0000000080000000  R15=00000000003b326a
00259752000i[CPU0  ] | IOPL=0 ID vip vif ac vm rf nt of df if tf sf ZF af PF cf
00259752000i[CPU0  ] | SEG sltr(index|ti|rpl)     base    limit G D
00259752000i[CPU0  ] |  CS:0008( 0001| 0|  0) 00000000 ffffffff 1 0
00259752000i[CPU0  ] |  DS:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  SS:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  ES:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  FS:0010( 0002| 0|  0) 00000000 ffffffff 1 1
00259752000i[CPU0  ] |  GS:0010( 0002| 0|  0) 7f746000 ffffffff 1 1
00259752000i[CPU0  ] |  MSR_FS_BASE:0000000000000000
00259752000i[CPU0  ] |  MSR_GS_BASE:ffff80007f746000
00259752000i[CPU0  ] | RIP=ffffffff8104f100 (ffffffff8104f100)
00259752000i[CPU0  ] | CR0=0xe0000011 CR2=0x0000000000000000
00259752000i[CPU0  ] | CR3=0x0000000001002000 CR4=0x000000a0
00259752000i[CPU1  ] CPU is in real mode (active)
00259752000i[CPU1  ] CS.mode = 16 bit
00259752000i[CPU1  ] SS.mode = 16 bit
00259752000i[CPU1  ] EFER   = 0x00000000
00259752000i[CPU1  ] | EAX=00000000  EBX=00000000  ECX=00000000  EDX=00000000
00259752000i[CPU1  ] | ESP=0000fff6  EBP=00005a8d  ESI=00000000  EDI=000081f4
00259752000i[CPU1  ] | IOPL=0 id vip vif ac vm rf nt of df if tf sf zf af pf cf
00259752000i[CPU1  ] | SEG sltr(index|ti|rpl)     base    limit G D
00259752000i[CPU1  ] |  CS:0000( 1e00| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  DS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  SS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  ES:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  FS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] |  GS:0000( 0000| 0|  0) 00000000 0000ffff 0 0
00259752000i[CPU1  ] | EIP=0000000b (0000000b)
00259752000i[CPU1  ] | CR0=0x60000010 CR2=0x00000000
00259752000i[CPU1  ] | CR3=0x00000000 CR4=0x00000000
00259752000i[CMOS  ] Last time is 1659879937 (Sun Aug  7 15:45:37 2022)
00259752000i[XGUI  ] Exit
00259752000i[SIM   ] quit_sim called with exit code 1



I genuinely don't know how to debug this, so any help would be appreciated.

This is my first post on these forums, so if I did anything wrong, please say it.

_________________
Code:
struct signature {
        char data[256]; /* null-terminated, 255 chars */
}


Top
 Profile  
 
 Post subject: Re: SMP: Problems with starting APs
PostPosted: Mon Aug 08, 2022 10:45 pm 
Offline
Member
Member

Joined: Mon Mar 25, 2013 7:01 pm
Posts: 5099
dbstream wrote:
The linker script places it at 0x8000.

Which bootloader are you using? GRUB does not support loading anything below 0x100000.


Top
 Profile  
 
 Post subject: Re: SMP: Problems with starting APs
PostPosted: Tue Aug 09, 2022 1:28 am 
Offline
Member
Member

Joined: Wed Aug 30, 2017 8:24 am
Posts: 1593
I see what the problem is: You are not copying the startup code to the correct page. You think your bootloader loads the startup code to 0x8000, but it won't. What you have to do instead is to actively copy the trampoline to the destination.

That is why I have the trampoline as part of ".rodata". All memory references are relative to its start, and it has a header that gets filled out after loading. Also, that code you have there never goes to 64-bit mode. So here's my trampoline:
Code:
/* AMD-64 SMP boot trampoline */
.section ".rodata", "a", @progbits
.align 4
.global trampoline_size
trampoline_size: .long trampoline_end - trampoline
.code16
.global trampoline
trampoline:
/* Assumption: At run time, we start with CS:IP = xx00:0000.
*
* We have to:
*  -load a GDT
*  -load an IDT
*  -switch to PM
*  -enable LM+PAE
*  -enable paging (switch to LM)
*
* In order to do the first, we have to LGDT with an address usable in real
* mode. We have no guarantee that any of the kernel is loaded into the first
* 4GB, except everything in this trampoline. so the GDT itself has to be part
* of the trampoline. Same for the IDT pointer.
* So let us start with a data section:
*/
jmp start16

/* first the stuff to talk to the booting kernel: */
.align 4
kcr3: .long 0   /* kernel CR3. Must be within the first 4GB */
kcode: .quad 0 /* kernel code */
kstack: .quad 0 /* kernel stack */
kgs: .quad 0 /* kernel GS pointer */
commword: .word 0   /* communication word. We set bit 0 to indicate awakeness, and wait for bit 1 to indicate the booter has seen it. */

/* IDT pointer (misaligned, but 3 words long, so it aligns everything after it)
* Set IDTR to 0/0, so any exception causes a tripple fault.
*/
idt_ptr: .word 0
    .long 0

/* GDT: pointer embedded in first element */
/* Code 32 segment: Offset 8.
* Data 32 segment: Offset 16.
* Code 64 segment: Offset 24.
*/
gdt:
    .word 0, gdt_end - gdt - 1
    .long gdt - trampoline
gdtcode32: .quad 0x00cf9a000000ffff
gdtdata32: .quad 0x00cf92000000ffff
gdtcode64: .quad 0x00af9a000000ffff
gdt_end:

pmptr: .long start32 - trampoline, 0x08
lmptr: .long start64 - trampoline, 0x18

start16:
    xorl %ebx, %ebx
    movw %cs, %bx
    movw %bx, %ds
    lock orw $1, commword - trampoline
1:
    pause
    testw $2, commword - trampoline
    jz 1b

    shll $4, %ebx
    addl %ebx, gdt + 4 - trampoline
    addl %ebx, pmptr - trampoline
    addl %ebx, lmptr - trampoline

    lidtl idt_ptr - trampoline
    lgdtl gdt + 2 - trampoline

    movl %cr0, %eax
    btsl $0, %eax       /* enable PE bit */
    movl %eax, %cr0     /* now we are in protected mode */
    ljmpl *(pmptr - trampoline)  /* and now we are in 32-bit protected mode */

.code32
start32:
    movw $0x10, %ax
    movw %ax, %ds
    movw %ax, %es
    movw %ax, %fs
    movw %ax, %gs
    movw %ax, %ss
    leal 4096(%ebx), %esp       /* load temporary stack pointer */
    pushl $0
    popfl                       /* initialize all flags (including I and D flag) to 0 */
    movl %cr4, %eax
    btsl $5, %eax
    movl %eax, %cr4             /* set PAE bit in CR4 */
    movl $0xc0000080, %ecx
    rdmsr
    btsl $8, %eax               /* set LME bit in EFER */
    wrmsr
    movl kcr3 - trampoline(%ebx), %eax
    movl %eax, %cr3             /* load CR3 */
    movl %cr0, %eax
    btsl $31, %eax              /* set PG bit in CR0 */
    movl %eax, %cr0             /* now we are in Long Compatibility mode */
    ljmpl *(lmptr - trampoline) /* and now we are in Long 64-bit mode */

.code64
start64:
    orl %ebx, %ebx              /* clear upper half of RBX */
    movq kstack - trampoline(%rbx), %rsp
    movq kgs - trampoline(%rbx), %rax
    movq %rax, %rdx
    movl $0xc0000101, %ecx
    shrq $32, %rdx
    wrmsr                       /* set GS.base */
    callq   *(kcode - trampoline)(%ebx)
1:
    cli
    hlt
    jmp 1b                      /* emergency catch loop. Should never run */
trampoline_end:

There, that does it. It is position independent. It is supposed to be used like this:
  1. Allocate a physical page below 1MB
  2. Copy trampoline there
  3. Allocate a physical page below 4GB
  4. Copy PML4 there
  5. Identity map at least the trampoline inside the copied PML4
  6. Fill out kcr3, kcode, kstack, and kgs at the start
  7. Do the IPI dance (setting the SIPI vector such that startup happens at the allocated page)
  8. Wait for commword to get its lowest bit set
  9. Set the second lowest bit. The AP is now on the way
Sure, I could invent a relocation format such that the loading CPU would relocate all the pointers, then I wouldn't have to do it inside the trampoline, but I think doing it this way uncouples trampoline and kernel better. And I like loosely coupled components, it affords flexibility.

The commword is there to detect if the first SIPI worked or if you need a second one. Goes back to an idea by Brendan. The loop until acknowledgement is about memory ownership. The memory for the copied PML4 and the copied trampoline belongs to the booting routine until the AP boots up. Then it is owned by the booting AP. So the booting routine must relinquish ownership, must indicate that it has seen what is going on. Therefore the handshake.

_________________
Carpe diem!


Top
 Profile  
 
 Post subject: Re: SMP: Problems with starting APs
PostPosted: Tue Aug 09, 2022 6:29 am 
Offline

Joined: Sun Aug 07, 2022 7:32 am
Posts: 2
Location: probably his house
Problem solved!

I debugged in bochs and dumped raw physical memory at 0x8000, turns out grub didn't load the .realmode segment.

I changed my linker script:
Code:
-   /*
-    * AP startup code needs to be placed below 1MB.
-    * Therefore we give it a separate segment.
-    */
-   . = 0x8000;
-   __realmode_start = .;
-   .realmode :
-   {
-      *(.realmode)
-   }
-
-   . = ALIGN(4K);
-   __realmode_end = .;
   .text : AT(ADDR(.text) - 0xffffffff80000000)
   {
+      __realmode_start = .;
+      *(.realmode)
+      . = ALIGN(4K);
+      __realmode_end = .;

... and memcpy'd the realmode page to 0x8000:
Code:
+   void prepare_start_aps(void)
+   {
+      memcpy((void *) 0x8000, __realmode_start, __realmode_end - __realmode_start);
+   }

I also implemented a better algorithm for sending INIT-SIPI-SIPI:
Code:
void start_ap(struct cpu *ap)
{
   __write_ap_flag(0);
   __write_ap_task(get_cpuvar_address_on(ap, idle_task));

   apic_write_icr(ap->lapic_id, 0x4500); /* send INIT level assert */
   apic_wait_icr();

   apic_write_icr(ap->lapic_id, 0x8500); /* send INIT level de-assert */
   apic_wait_icr();

   udelay(500);

   u32 apic_error;

   for(int i = 0; i < 2; i++) {
      apic_write_icr(ap->lapic_id, 0x4608);

      udelay(200);

      apic_wait_icr();

      udelay(300);

      apic_error = apic_read_esr();
      if(apic_error)
         panic("APIC error 0x%x when starting CPU%u", apic_error, ap->cpu_id);

      if(__read_ap_flag() == 1)
         goto done;
   }

   udelay(10000);

   if(__read_ap_flag() != 1)
      panic("failed to start CPU%u", ap->cpu_id);
done:
   ...
}

... and actually implemented the switch to 64-bit mode (that I had left out to reduce how much I needed to debug. My original __ap_trampoline did set the __ap_flag before it went into a cli-hlt-loop):
Code:
__ap_trampoline:
   cli
   hlt
   jmp __ap_entry
.size __ap_entry, . - __ap_entry
   ljmp $0, $(__ap_jmp_target - __ap_trampoline + 0x8000)

__ap_init_gdt_table:
   .long 0, 0         /* null segment */
   .long 0x0000ffff, 0x00cf9a00   /* code segment */
   .long 0x0000ffff, 0x008f9200   /* data segment */
__ap_init_gdt_ptr:
   .word __ap_init_gdt_ptr - __ap_init_gdt_table - 1
   .long __ap_init_gdt_table - __ap_trampoline + 0x8000
__ap_jmp_target:
   xorw %ax, %ax
   movw %ax, %ds
   lgdtl __ap_init_gdt_ptr - __ap_trampoline + 0x8000
   movl %cr0, %eax
   orl $1, %eax
   movl %eax, %cr0
   ljmp $8, $(__ap32 - __ap_trampoline + 0x8000)

.code32
__ap32:
   movw $16, %ax
   movw %ax, %ds
   movw %ax, %es
   movw %ax, %fs
   movw %ax, %gs
   movw %ax, %ss

   movb $1, (__ap_flag - __ap_trampoline + 0x8000)

/* switch to 64-bit mode */

   movl %cr4, %eax
   orl $(1 << 5), %eax
   andl $~(1 << 7), %eax
   movl %eax, %cr4

   movl $__boot_pml4, %eax
   movl %eax, %cr3

   movl $0xc0000080, %ecx
   rdmsr
   orl $(1 << 8), %eax
   wrmsr

   movl %cr0, %eax
   orl $((1 << 31) | 1), %eax
   movl %eax, %cr0

   movl %cr4, %eax
   orl $(1 << 7), %eax
   movl %eax, %cr4

   lgdt (__ap_init_gdt64_ptr - __ap_trampoline + 0x8000)(,1)
   jmp $8, $(__ap64 - __ap_trampoline + 0x8000)

.code64
__ap64:
   movl $16, %eax
   movl %eax, %ds
   movl %eax, %es
   movl %eax, %fs
   movl %eax, %gs
   movl %eax, %ss

   movq (__ap_task - __ap_trampoline + 0x8000), %rsi

   movq (%rsi), %rsp

   popq %rdi
   popq %rsi
   popq %r15
   popq %r14
   popq %r13
   popq %r12
   popq %rbx
   popq %rbp
   ret

__ap_init_gdt64:

   .quad 0

/* kernel code segment - 8 */
   .long 0xffff
   .byte 0
   .byte 0b10011010
   .byte 0b10101111
   .byte 0

/* kernel data segment - 16 */
   .long 0xffff
   .byte 0
   .byte 0b10010010
   .byte 0b11001111
   .byte 0

__ap_init_gdt64_ptr:
   .word __ap_init_gdt64_ptr - __ap_init_gdt64
   .quad __ap_init_gdt64 - __ap_trampoline + 0x8000

.align 8
.globl __ap_task
__ap_task:
   .quad 0 /* filled in by C */
.globl __ap_flag
__ap_flag:
   .byte 0

I have a struct task for each CPU, a pointer to which I store in __ap_task. From that structure, I load the stack pointer and pop a bunch of registers before jumping to C code.
Code:
struct task {
   u64 sp;
   ... (things like mm, state, LL node for scheduler runqueue)
};

// task->sp points to this
struct initial_stack_frame {
   u64 rdi;
   u64 rsi;
   u64 r15;
   u64 r14;
   u64 r13;
   u64 r12;
   u64 rbx;
   u64 rbp;
   u64 rip;

   // for stack trace to work, these must be 0
   u64 null_rbp;
   u64 null_rip;
};

And now everything works as it should.
:-)

_________________
Code:
struct signature {
        char data[256]; /* null-terminated, 255 chars */
}


Top
 Profile  
 
Display posts from previous:  Sort by  
Post new topic Reply to topic  [ 4 posts ] 

All times are UTC - 6 hours


Who is online

Users browsing this forum: Bing [Bot] and 77 guests


You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot post attachments in this forum

Search for:
Jump to:  
Powered by phpBB © 2000, 2002, 2005, 2007 phpBB Group