A dynamic kernel memory allocator

sunnysideup · **Joined:** Sat Feb 08, 2020 11:11 am **Posts:** 106

Hello! I've been making a dynamic memory allocator. I'd love to hear your inputs and suggestions if I'm going wrong somewhere.... The idea is to have a minimal structure allocated at compile time (kernel_heap and kernel_heap_pool). The current implementation has a limited number of entries in the kernel_heap_pool, and perhaps soon I can use some sort of linked list structure to number of entries, where the first pool is allocated at compile time and the rest is a part of the heap itself.

Code:

//This is my implementation of a kernel heap. It is a bunch of virtual memory "filled" with physical addresses.

//Note: every address in the heap needs to be accounted for by the linked list!!! The "useful" entries, will be the only ones part of the linked list. TODO: Extend heap entry pool using linked list
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

#include <mem.h>

#define PAGE_SIZE 4096
#define MAX_HEAP_SIZE 0x400000 //4M for now 
#define MAX_HEAP_ENTRIES 256//Max addresses the heap gives you
#define STANDARD_INCREASE 16 //Increases the size of the heap by 16 pages
#define MIN_ALLOC_SIZE 16 //A kalloc will give you min 16 bytes!!

extern uint32_t __kernel_heap[];//Ensure that this is page aligned

typedef struct _heap_entry 
{
    bool is_useful; //Part of heap??

    void* start_address;
    uint32_t size;
    bool is_hole;
    struct _heap_entry* next_entry;
    struct _heap_entry* prev_entry;
}heap_entry;

typedef struct _heap
{
    //Static stuff
    uint32_t max_entry_count;
    uint32_t max_size;

    //Dynamic stuff
    uint32_t entry_count;
    uint32_t size;
    void* heap_start_address;
    heap_entry* start_entry;
    heap_entry* last_entry;

}heap;


static heap_entry heap_entry_pool[MAX_HEAP_ENTRIES];
static heap kernel_heap;

static heap_entry* allocate_heap_entry()
{
  uint32_t lazy =0;

  for(uint32_t i=0;i<MAX_HEAP_ENTRIES;i++)
  {
    uint32_t temp = (lazy+i)%MAX_HEAP_ENTRIES;
    if(heap_entry_pool[temp].is_useful) continue;
    heap_entry_pool[temp].is_useful = true;
    lazy = temp+1;
    kernel_heap.entry_count++;
    return &heap_entry_pool[temp];
  }
  //TODO: Perhaps we can use a linked list to grow this list dynamically too!!
  //The last pool entry can point to a new heap pool present in the heap
  return NULL;
}

static bool expand_heap(uint32_t page_count)
{
  uint32_t req_size = page_count * PAGE_SIZE;
  if (req_size + kernel_heap.size > kernel_heap.max_size) goto failure;
  void* vma = kernel_heap.heap_start_address + kernel_heap.size;
  for(uint32_t i=0;i<page_count;i++)
  {
    uint32_t pma = pmmngr_allocate_block();
    if(!pma) goto failure;
    if(!map_page(vma,pma,false,true)) goto failure; //(vma,pma,isuser,iswritable)
    vma += PAGE_SIZE;
  }
  //Now, vma "holds" a size of page_count;

  heap_entry* pointer = kernel_heap.last_entry;
  if(pointer -> is_hole)
    pointer -> size += req_size;
  else
  {
    heap_entry* temp = allocate_heap_entry();
    if(!temp) goto failure;
    temp -> start_address = vma;
    temp -> size = req_size;
    temp -> is_hole = true;
    temp -> next_entry = NULL; //Last entry
    temp -> prev_entry = pointer;
    pointer -> next_entry = temp;
    kernel_heap.last_entry = temp;
  }
  kernel_heap.size += req_size;
  return true;

  failure: //TODO: Handle failures better - revert already mapped pages
  return false;
}



static void initialize_kernel_heap()
{
    for(int i = 0;i< MAX_HEAP_ENTRIES;i++)
        heap_entry_pool[i].is_useful = false;
    
    kernel_heap.max_size = MAX_HEAP_SIZE;
    kernel_heap.max_entry_count = MAX_HEAP_ENTRIES;
    
    kernel_heap.entry_count = 0;
    kernel_heap.size = 0;
    kernel_heap.heap_start_address = __kernel_heap;
    kernel_heap.start_entry = allocate_heap_entry();
    kernel_heap.last_entry = kernel_heap.start_entry;
     
    kernel_heap.start_entry->start_address = __kernel_heap;
    kernel_heap.start_entry->size = 0;
    kernel_heap.start_entry->is_hole = true;
    kernel_heap.start_entry->next_entry = NULL;
    kernel_heap.start_entry->prev_entry = NULL;
}

static void* kernel_heap_alloc(uint32_t size) //This is the main part... It allocs stuffon heap
{
  while(1) //Keep expanding the heap
  {
    for(heap_entry* iterator = kernel_heap.start_entry ; iterator ; iterator=iterator->next_entry)
    {
      if (!iterator -> is_hole) continue;
      if (iterator -> size == size)
      {
        iterator -> is_hole = false;
        return iterator -> start_address;
      }
      if (iterator -> size > size)
      {
        heap_entry* temp = allocate_heap_entry();
        if(!temp) goto failure;
        temp -> start_address = iterator -> start_address + size;
        temp -> size = (iterator -> size) - size;
        temp -> is_hole = true;
        temp -> next_entry = iterator -> next_entry;
        temp -> prev_entry = iterator;

        iterator -> size = size;
        iterator -> is_hole = false;
        iterator -> next_entry = temp;

        return iterator -> start_address;
      }
    }
    //Now, we've seen that the heap is small 
    if(!expand_heap(STANDARD_INCREASE)) goto failure;
  }
failure:
  return NULL;
}

static bool kernel_heap_free(void* address)
{
  //Find the corresponding entry... Maybe we can have a binary tree or something
  for(heap_entry* iterator = kernel_heap.start_entry ; iterator ; iterator=iterator->next_entry)
  {
    if ((uint32_t)address > (uint32_t)iterator->start_address) goto failure;
    if (address == iterator->start_address)
    {
      if(iterator->next_entry && iterator->next_entry->is_hole) //If next entry is free join it
      {
        iterator -> size += iterator -> next_entry -> size;
        iterator -> next_entry -> is_useful = false;
        kernel_heap.entry_count --;
        iterator -> next_entry = iterator -> next_entry -> next_entry;
        if(iterator -> next_entry) iterator -> next_entry -> prev_entry = iterator;
      }
      if(iterator->prev_entry && iterator->prev_entry->is_hole) 
      {
        iterator -> start_address = iterator -> prev_entry -> start_address;
        iterator -> size += iterator -> prev_entry -> size;
        iterator -> prev_entry -> is_useful = false;
        kernel_heap.entry_count --;
        iterator -> prev_entry = iterator -> prev_entry -> prev_entry;
        if(iterator -> prev_entry) iterator -> prev_entry -> next_entry = iterator;
      }
      iterator->is_hole = true;
      if (!iterator -> next_entry) kernel_heap.last_entry = iterator; //TODO: Maybe free the pages too??
      return true;
    }
  }
failure:
  return false;
}
static bool is_kheap = false;
void* kalloc(uint32_t size)
{
  if(size < MIN_ALLOC_SIZE) size = MIN_ALLOC_SIZE;
  if (!is_kheap)
  {
    initialize_kernel_heap();
    is_kheap = true;
  }
  return kernel_heap_alloc(size);
}

bool kfree(void* address)
{
  void* h_st =  kernel_heap.heap_start_address;
  void* h_en =  h_st + kernel_heap.size;
  if((address < h_st) || (address > h_en) || !is_kheap) return false;
  return kernel_heap_free(address);
}

nullplan · **Joined:** Wed Aug 30, 2017 8:24 am **Posts:** 1604

So, this is basically a standard dlmalloc except for a few key details:

The heap is not split into buckets, and
The chunk structure is stored in a static list externally rather than internally

Personally, I'm not a big fan of either change. The first means the allocator has to search for a chunk that is large enough and the latter means that the free function also has to search for the correct chunk structure. This means both now have a search loop included, so the runtime is linear in the number of active chunks.

I'm quite fond of musl's malloc implementation, although it has a known problem[1] that Rich is currently addressing with a complete rewrite I could not yet wrap my head around. Anyway, the current implementation means in the single threaded case that malloc() needs to increase heap size at most once, and that free() is constant time, too, so you might want to look into adapting that implementation. Or, if you want something different and really secure, look into adapting omalloc, OpenBSD's version of malloc. Centerpiece of that thing is a hash table of all pointers ever handed out. It is not the fastest malloc ever written, but it is quite resilient against misuse, and it also stores chunk boundaries externally, so most buffer overflows will not destroy the heap, and invalid free()s will be detected.

[1]: The problem is, BTW, that in the multi-threaded case, it can happen that if two threads are allocating memory at the same time, or one is returning memory and another is allocating, that one thread temporarily allocates the entire heap and then is suspended, and then the other thread sees that it must expand the heap because no memory is left. So two threads running "free(malloc(50))" in a loop over and over can consume unbounded memory.

thewrongchristian · **Joined:** Tue Apr 03, 2018 2:44 am **Posts:** 402

sunnysideup wrote:

Hello! I've been making a dynamic memory allocator. I'd love to hear your inputs and suggestions if I'm going wrong somewhere.... The idea is to have a minimal structure allocated at compile time (kernel_heap and kernel_heap_pool). The current implementation has a limited number of entries in the kernel_heap_pool, and perhaps soon I can use some sort of linked list structure to number of entries, where the first pool is allocated at compile time and the rest is a part of the heap itself.

Limiting the number of entries is a bad idea. Having a maximum number of allocations at any one point is a bad move, the heap is supposed to be a general purpose memory allocation, bounded by available memory rather than available statically allocated buffers. I understand you're keeping it simple and it's baby steps, but this sort of structure is just not going to be scalable and you'll soon outgrow it for the following reasons:

* Arbitrary number of allocations, as already mentioned.
* Linear searching of data structures. As you increase the number of allocations, you'll be increasing the size of these linear searches.
* I think the code relies on allocations being stored in address order. If you rely on this going forward, you'll have to maintain the structures in order, which isn't well suited to arrays or linked lists. You'd want a binary tree at least.
* You currently have no locking. When you add locking, and you'll need locking if you ever have more than one thread being able to allocate memory, then you'll have a bottleneck.

Most kernels these days tend to use a slab based allocators, which address many of the above issues:

* Slabs are some multiple of page size, and so it is easy to take an arbitrary pointer and find the container slab and associated structures.
* Slabs provide a natural point of synchronization, so that multiple threads can allocate from different slabs concurrently with minimal contention.
* Allocation and freeing an allocation of a slab chunk involves simple bit map manipulation (fast on most hardware) to find or free the chunk.

The only limitation of slab allocators is the maximum size of a single allocation is somewhat less than the size of your slab, so for allocations that are larger than your slab size, you'll need something else. But these should be rare, because larger allocations tend to be:

* Page based allocations, for things like assigning pages to handle page faults.
* Allocating memory for I/O - Tend to require aligned memory anyway.

These use cases wouldn't suit your allocator either, so this is no big problem. But if you do have bigger allocations, you can defer to an alternative allocator which allocates in multiples of page size, and can be selected based on the allocation size requested.

sunnysideup · **Joined:** Sat Feb 08, 2020 11:11 am **Posts:** 106

I think I can solve the limited entry problem by perhaps using the last entry to point to allocate a new pool of nodes in the heap and so on.. How would this work?

thewrongchristian · **Joined:** Tue Apr 03, 2018 2:44 am **Posts:** 402

sunnysideup wrote:

I think I can solve the limited entry problem by perhaps using the last entry to point to allocate a new pool of nodes in the heap and so on.. How would this work?

Badly, as you'll still be doing a linear search on allocate and free.

If you want to go down this route, instead of using a slab allocator, then track each chunk using a binary tree. That will allow you to quickly check pointers are valid, as a valid pointer will also correspond to an active chunk. You'll also need a separate structure to track free chunks, which you could also do by address, and use first fit to keep searches short, or track chunks by size, so you can find a chunk of sufficient size quickly.

To boot, I cannot overstate how useful a general purpose binary tree is within the kernel anyway, it's a data structure you can use again and again, so it's worth implementing. My tree maps from intptr_t keys to intptr_t values, with wrapper macros to allow its use with pointer keys and/or values. But you will have a chicken and egg scenario, where the tree nodes would ideally be allocated from a heap. Again, this isn't a problem a slab based allocator, so that as well points to a slab allocator being your best bet.

If you implement the binary tree inline (using some of the allocated chunk for the node data) that can remove the chicken and egg problem, at the expense of generality. Perhaps not a problem in the long term, you can generalize later if required.

But my own 2c is research and implement a slab based allocator. You shouldn't need such a general purpose allocator in the kernel able to allocate arbitrarily big blocks. Just define a reasonable pool of slab types.

AndrewAPrice · **Posted:** Mon May 18, 2020 2:57 pm

It would be fun to write an allocator one of these days.

My idea would be to keep track of the unallocated memory. Unallocated memory would be represented by:

Code:

struct UnallocatedMemory {
  size_t length;

  // Bitwise trie entry of the start addresses of free memory.
  UnallocatedMemory* start_addr_parent;
  UnallocatedMemory* start_addr_smaller_child;
  UnallocatedMemory* start_addr_larger_child;

  // Bitwise trie entry of the length of this free memory chunk.
  UnallocatedMemory* length_parent;
  UnallocatedMemory* length_parent_smaller_child;
  UnallocatedMemory* length_parent_larger_child;
  UnallocatedMemory* length_parent_next_child;
};

You would store this struct at the start of the unallocated memory. On a 64-bit machine, this struct would be 64 bytes, so you wouldn't be able to have empty holes smaller than 64 bytes in memory.

There would be two bitwise tries. One for the start address of the free memory, one for the size of the free memory range.

Bitwise tries allow for the fast implementation of "get node at key, and if doesn't exist, get the next [higher/lower] key". So, let's say you want a 1024 chunk of memory, we can implement "smallest fit" by finding an UnallocatedMemory block with a length of 1024, and if one doesn't exist of that size, we'd find the next highest, which we could split (if the new gap is larger than 64 bytes.) And, if this returns none, we can request pages from the physical allocator/kernel.

Allocated memory would take up 8 bytes (assuming 64-bit addresses) followed by the requested memory size. This is so when we call free(), we can check memory_ptr - 8 to get the allocated memory's size, and during the process of turning it back into UnallocatedMemory, we could check the start address trie to see if we can merge it with the previous/next memory chunk.

You would only need global two pointers (pointers to the parent nodes of the two tries).

I'd love to hear ideas on how we could handle multiple threads. Perhaps give each thread it's own UnallocatedMemory tries (16 bytes per thread), and if we can't find a fit during allocation, we ask other threads before making a system call for another page? (And on thread destruction, do we merge our UnallocatedMemory tries with the next thread's tries? This would require kernel support to notify the application when a thread terminates?)

sunnysideup · **Joined:** Sat Feb 08, 2020 11:11 am **Posts:** 106

Perhaps I can have the binary tree as an 'add-on'.. After the heap index list grows a decent size, I can allocate a pool of tree nodes on the heap and initialize the tree in the right way (to track the heap entries).This deals with the chicken and egg problem perhaps.

Perhaps I'll invest in a slab allocator as well

The trie implementation looks interesting, I'll have a look at that as well

OSDev.org

A dynamic kernel memory allocator

Who is online