Universal File System

Kevin McGuire · **Posted:** Mon Jan 16, 2006 11:33 am

[font=Lucida Console]
#define ENTRYTYPE_UNUSED         0x00
#define ENTRYTYPE_DIRECTORY         0x01
#define ENTRYTYPE_FILE            0x10
#define ENTRYTYPE_UNUSABLE         0x80
#define ENTRYTYPE_DELETEDDIR      0x81
#define ENTRYTYPE_DELETEDFILE      0x90

struct tMagic{
   unsigned char      m0;
   unsigned short      m1;
};
struct tSuperBlock{
   unsigned char      bootCode0[11];
   unsigned char      legacyBIOS[21];
   unsigned char      bootCode1[372];
   __int64            timeStamp;
   __int64            dataCount;
   __int64            indexCount;
   tMagic            magic;
   unsigned char      version;
   __int64            blockTotal;
   unsigned int      blockReserved;
   unsigned char      blockSize;
   unsigned char      checksum;
   unsigned char      compParitionTable[64];
   unsigned char      compBootSignature[2];

};
struct tIndexVolumeIndentiferEntry{
   unsigned char      type;
   struct{
      unsigned short      reserved0;
      unsigned char      reserved1;
   }reserved;
   tTimeDate         timeDate;
   unsigned char      name[52];
};

struct tIndexStandardEntry{
   unsigned char      type;
   unsigned char      flags;
   unsigned short      continuationCount;
   __int64            timeDate;
   __int64            blockStart;
   __int64            blockEnd;
   __int64            fileLength;
   unsigned char      name[20];
};
struct tIndexContinuationEntry{
   unsigned char      name[64];
};
struct tIndexStartingMarkerEntry{
   unsigned char      type;
   unsigned char      reserved[63];
};

unsigned int mmAllocPage(void);                              // allocates page
void mmAllocFreePage(unsigned int page);                     // free page
unsigned int fsReadBlock(__int64 blockIndex, unsigned int address);   // read fs block
unsigned int fsWriteBlock(__int64 blockIndex, unsigned int address);// write fs block

unsigned int fsDefragment(tSuperBlock * superBlock){
   tIndexStandardEntry * stdEntry;


   __int64 dataBlockIndex = superBlock->blockReserved;
   __int64 dataBlockClosest = 0;
   __int64 dataBlockMove;
   __int64 found;
   unsigned int memData = mmAllocPage();
   unsigned int tmpData = mmAllocPage();

   __int64 bi;
   for(;;){
      found = superBlock->blockTotal;      /* We use this to compute a between range of last defragment block, and the largest block. */
      /* cycle through all blocks in the FS using the index area. */
      for(bi = superBlock->blockTotal - superBlock->indexCount; bi < superBlock->blockTotal; bi++){
         if(!fsReadBlock(bi, memData))
            return 0;               /* error reading block */
         stdEntry = (tIndexStandardEntry*)memData;
         /* Remove deleted things. This can corrupt the FS, after defrag. */
         if((stdEntry->type == ENTRYTYPE_DELETEDDIR) | (stdEntry->type == ENTRYTYPE_DELETEDFILE)){
            stdEntry->type = ENTRYTYPE_UNUSED;
            if(!fsWriteBlock(bi, memData))
               return 0;
         }
         /* make sure the entry is a directory or file */
         if((stdEntry->type == ENTRYTYPE_DIRECTORY)|(stdEntry->type = ENTRYTYPE_FILE))
            /* if we have previously found one, make sure this one is closer to dataBlockClosest than
             our previous find. We assume that the index entrys are in ANY order!
          */
            if((stdEntry->blockStart >= dataBlockClosest) & (stdEntry->blockStart < found))
               found = stdEntry->blockStart;
         /*
            Jump past any continuation blocks.
         */
         bi += stdEntry->continuationCount;
      }
      /*
         This means that dataBlockClosest was so big. It exceeded any known file in the system. This means we
         have defragmented every file possible.
      */
      if(found == superBlock->blockTotal)
         break;                     /* we are done. */
      /*
         Add one to jump out of the actual data block for a file, and into the next avaible block.
         The next availible block could be used. It might not. Hopefully it points to some slacks pace
         so the next cycle of for(BI.. will find it.
      */
      dataBlockClosest = stdEntry->blockEnd + 1;
      /* --------------- defragment found entry -------------------- */
      /* How many blocks down should we shift? */
      dataBlockMove = stdEntry->blockStart - dataBlockClosest;
      /* None. The file's data was already contingious with the last. */
      if(dataBlockMove > 0){
         /* Lets move the data using dataBlockMove as the shift down count. */
         for(__int64 mi = 0; mi < stdEntry->blockEnd - stdEntry->blockStart; mi++){
            /* read it */
            if(!fsReadBlock(bi + mi, tmpData))
               return 0;
            /* write it - shifted down #datablockMoves# number of times. */
            if(!fsWriteBlock(bi + mi - dataBlockMove, tmpData))
               return 0;
         }
         /* update the index block on what we did. */
         dataBlockClosest -= dataBlockMove;                  /* dont forget we changed the disk structor */
         stdEntry->blockStart -= dataBlockMove;
         stdEntry->blockEnd -= dataBlockMove;
         if(!fsWriteBlock(bi, memData))                     /* update stdEntry */
            return 0;
      }
      /* try another */
   }
   return 1;         /* we are all done. */
}
[/font]

Kevin McGuire · **Posted:** Mon Jan 16, 2006 12:14 pm

I also wanted to note:

I used the design of reading and writting one block at a time, to show the arithogram working using the smallest amount of memory. However, the entire image could be read into memory and manipulated much faster. Im sorry. I should have done it using the entire image loaded into memory. That would have simplified it alot.

Rob · **Posted:** Mon Jan 16, 2006 5:02 pm

My "thoughts" on the new proposal:

1. index entries are actually not 64 bytes long but 56 in
the current implementation. So entry name needs to grow
to 28 bytes! However, I'm wondering the following:

0x00 - 1 - Entry type
0x01 - 1 - Number of continuation entries following this entry
0x02 - 8 - Time stamp
0x0A - 8 - Starting block number in the data area
0x12 - 8 - Ending block number in the data area
0x1A - 8 - File length in bytes
0x22 - 30 - Entry name in UTF8

0x00 Unused entry
0x01 for the Volume Identifier
0x02 Directory entry
0x03 Starting marker entry
0x10 File entry
0x80 Unusable entry
0x82 Deleted directory entry
0x90 Deleted file

This gets us two extra bytes for the entry name and
a total of 30 + (255 * 64) = 16350 which is imho more
than enough. However, if you truly want unlimited
path names why not simply stop at the continuation
entry that has the \0 terminator in it? No real reason
to store the count in that case. If you decide to go
down that road the second byte could be turned into
a reserved byte (0 for now), that could in the future
incorporate perhaps something else (like attribute flags?).

2. when I create a file in windows it says the following characters are not allowed (from what I can generate on my keyboard): / \ : * ? " < > |

so I can make a file "=;,+"! Which would not fit in this filesystem. Why aren't those characters allowed?

personally I actually use comma and plus sign sometimes

3. just to be sure, did you see my previous remark on a timestamp for directories? This so a rename updates the timestamp on the directory as well. Or a tool could touch it to let you know the contents has been updated. Since the bytes are there anyway....

4. the continuation entries most be consecutive, right? That isn't mentioned

5. the idea (I assume) is that the index area can grow down
when you are running out of room?

6. I'm thinking the starting marker should be a required thing
to implement. Here's why. If the superblock would be corrupt
or unreadable you don't know if there is a starting marker.
So recovery software could find a starting marker where there
might not be one (this may lead to corruption). It should be
a relative easy feature to implement. Just put it in the first
entry, and if you need to grow the index fill that entry with
normal data and move the marker to the new place (you need to
update the super block anyway)

7. "where every file has one entry" should read "where every file
has one or more entries"

I'm off to bed

Pype.Clicker · **Posted:** Tue Jan 17, 2006 4:54 am

considering the repeated concerns with the file names stuff, i'd tend to suggest to follow the kind of approach that one can find in an executable file, that is

- the 'file entries' are compact. They only contain 'fixed size' stuff such as timestamps, file size, sectors information, etc.
- all the file names are collated together into the "names section" (that could be, for instance, another "special area" or just a specific file entry such as the first one, or the file entry with type==X).

A file entry would therefore not carry the filename but instead the offset of its name in the "names area". That makes reading from the camel ultra-simple: load all the "names file", browse the files entries using names[entry->fname] rather than entry->fname for comparison.

Extending the idea, you could have a couple of "names" rather than a single one: the "prefix" name (reused by all the files in the same directory) and the "own" name (e.g. the filename "per se").

For systems short on RAM, the offset within the name area can be directly translated into a single block to be loaded.

anyone interested ?

Candy · **Posted:** Tue Jan 17, 2006 4:58 am

Before we continue, could we set up a list of requirements for this file system? I expect that would allow us to cancel out a few discussions directly, such as supporting fragmentation versus not supporting it, variable inode count or static, fixed file name length or variable, stuff like that.

Brendan · **Posted:** Tue Jan 17, 2006 5:49 am

Hi,

Rob wrote:

This gets us two extra bytes for the entry name and
a total of 30 + (255 * 64) = 16350 which is imho more
than enough. However, if you truly want unlimited
path names why not simply stop at the continuation
entry that has the \0 terminator in it? No real reason
to store the count in that case. If you decide to go
down that road the second byte could be turned into
a reserved byte (0 for now), that could in the future
incorporate perhaps something else (like attribute flags?).

The problem here is detecting if an entry is a continuation entry or not (if you're unlucky, some file names would cause a continuation entry to look like a standard entry). For example, the UTF character code 0x00C2 (Latin capital letter 'A' with circumflex) is encoded as 2 bytes, 0xC3, 0x82. If the second byte of the character happens to be the first byte in a continuation entry, then software could interpret it as the type byte for a deleted directory entry. In the same way the UTF character code 0x00D0 (Latin capital letter ETH) could look like an entry for a deleted file.

The previous version of the specification prevented this by making sure that the first byte of any entry (except for continuation entries) is a value that isn't possible in a valid file name. Due to the way UTF8 encoding works, this can be any value between 0x01 and 0x1F (inclusive).

However, you are correct in that the entry type and the flags for standard entries could be combined into a single byte (as long as the only values used are between 0x01 and 0x1F), which would give an extra 2 bytes for entry names (the "number of continuation entries" was only really 2 bytes for alignment).

Rob wrote:

2. when I create a file in windows it says the following characters are not allowed (from what I can generate on my keyboard): / \ : * ? " < > |

so I can make a file "=;,+"! Which would not fit in this filesystem. Why aren't those characters allowed?

Originally I thought that it'd be better to limit the characters when a file is created, rather than allowing characters that some OSs (like DOS) would have problems with. For example, in DOS the command "copy foo.txt+bar.txt new.txt" will combine both files ("foo.txt" and "bar.txt") into a single file ("new.txt").

I'm not so sure this was a good idea now, as an old OS like DOS will have huge problems with UTF8 and file name lengths anyway. Linux, OS X and modern versions of Windows can handle these characters...

Rob wrote:

3. just to be sure, did you see my previous remark on a timestamp for directories? This so a rename updates the timestamp on the directory as well. Or a tool could touch it to let you know the contents has been updated. Since the bytes are there anyway....

I'm not sure the benefits of this would justify the extra code it'd take or the extra writes it'd need to implement. Can you think of a situation where this would be useful?

Rob wrote:

4. the continuation entries most be consecutive, right? That isn't mentioned

5. the idea (I assume) is that the index area can grow down
when you are running out of room?

Yes and yes - I'll clarify this for the next version of the specification.

Rob wrote:

6. I'm thinking the starting marker should be a required thing
to implement. Here's why. If the superblock would be corrupt
or unreadable you don't know if there is a starting marker.
So recovery software could find a starting marker where there
might not be one (this may lead to corruption). It should be
a relative easy feature to implement. Just put it in the first
entry, and if you need to grow the index fill that entry with
normal data and move the marker to the new place (you need to
update the super block anyway)

File recovery isn't something that would actually happen often. For e.g. most people would throw the floppy in the bin and just copy the orginal files to a different floppy and try again. When file recovery is done the recovery utility will need to handle read errors from all over the media - for example, the super-block, several sectors in the index area and several sectors in the data area might be unreadable. In this case the sector containing the starting marker may be unreadable.

Still, there isn't really any reason not to put a starting marker or to make the starting marker a requirement.

Rob wrote:

7. "where every file has one entry" should read "where every file has one or more entries"

Being fixed!

Thanks,

Brendan

JoeKayzA · **Posted:** Tue Jan 17, 2006 5:56 am

Rob wrote:

However, if you truly want unlimited
path names why not simply stop at the continuation
entry that has the \0 terminator in it? No real reason
to store the count in that case. If you decide to go
down that road the second byte could be turned into
a reserved byte (0 for now), that could in the future
incorporate perhaps something else (like attribute flags?).

That came to my mind too, but that means that you'll have to calculate the string length (iterate until the NULL-byte is reached) everytime you iterate over an index entry. This could become a serious performance hog, so during a simple optimization you'll end up calculating the name length once and store it in memory - which is quite exactly what we are doing here in the index entry itself.

cheers Joe

Brendan · **Posted:** Tue Jan 17, 2006 7:17 am

Hi,

Pype.Clicker wrote:

considering the repeated concerns with the file names stuff, i'd tend to suggest to follow the kind of approach that one can find in an executable file, that is

- the 'file entries' are compact. They only contain 'fixed size' stuff such as timestamps, file size, sectors information, etc.
- all the file names are collated together into the "names section" (that could be, for instance, another "special area" or just a specific file entry such as the first one, or the file entry with type==X).

A file entry would therefore not carry the filename but instead the offset of its name in the "names area". That makes reading from the camel ultra-simple: load all the "names file", browse the files entries using names[entry->fname] rather than entry->fname for comparison.

Extending the idea, you could have a couple of "names" rather than a single one: the "prefix" name (reused by all the files in the same directory) and the "own" name (e.g. the filename "per se").

For systems short on RAM, the offset within the name area can be directly translated into a single block to be loaded.

anyone interested ?

Typical disk media is like a long line of blocks (or sectors), with 2 ends. With 2 variable sized areas this works well - e.g. a variable size "data area" at one end with a variable sized "index area" at the other, with free space in the middle to allow both to grow.

The easiest way to implement three variable sized areas (without over-complicating the code necessary to manage them) would be to find some media with 3 ends. Instead of having a long line of blocks you'd want a triangle, so that each variable sized area can grow from it's corner towards the middle, and the middle would contain the free space.

Candy wrote:

Before we continue, could we set up a list of requirements for this file system? I expect that would allow us to cancel out a few discussions directly, such as supporting fragmentation versus not supporting it, variable inode count or static, fixed file name length or variable, stuff like that.

I'd assume the requirements would be derived from the purpose of the file system. The purpose of the file system would be to provide a modern common format that allows files to be transfered between different "systems" (where a "system" is a computer with an OS and a pile of resources, or an embedded device without an OS and very limited resources, or anything else that deals with files). It is not in any way intended to be a replacement for any OS's "native" file system (e.g. NTFS, ext3, reiserFS, etc).

IMHO the requirements would be:

IMHO the purpose of the file system also makes some things "less than important".

The purpose of the file system makes it's usage different too - I'd expect that most of the time people will format the media, copy files to it, take it somewhere else, copy or move the files from it to something else, and then format it again for next time.

Not sure if any of this helps though....

Cheers,

Brendan

Pype.Clicker · **Posted:** Tue Jan 17, 2006 7:26 am

Brendan wrote:

The easiest way to implement three variable sized areas (without over-complicating the code necessary to manage them) would be to find some media with 3 ends. Instead of having a long line of blocks you'd want a triangle, so that each variable sized area can grow from it's corner towards the middle, and the middle would contain the free space.

Okay. I better see it now. i guess that makes your design decisions clearer to understand.

Oh, btw, i can give you a media with 3 ends if you want: area 'A' grows from sector 0 and extends upwards only on even sectors, area 'B' grows from sector 1 and extends upwards only on odd sectors while area 'X' grows from the last sector downwards using both odd and even sectors

It would probably not be so practical to use, however ... don't take it seriously ;D

Candy · **Posted:** Tue Jan 17, 2006 7:48 am

It's asking for a 1-dimensional objects with three limits. There are only a maximum and a minimum, and that's all you get.

Brendan · **Posted:** Tue Jan 17, 2006 11:06 am

Hi,

I've updated the draft specification again to take into account the recent discussions.

I still haven't done anything in the "algorithms" part yet - I think I'll wait until the main part of the specification stops changing..

It's available from my web site at:

http://bcos.hopto.org/sfs.html

Cheers,

Brendan

Candy · **Posted:** Tue Jan 17, 2006 12:21 pm

Brendan wrote:

I still haven't done anything in the "algorithms" part yet - I think I'll wait until the main part of the specification stops changing..

I'll wait for making the driver until the specification itself is set. At least, until it's somewhat certain.

df · **Joined:** Fri Oct 22, 2004 11:00 pm **Posts:** 1076

ive been too busy lately, so i'm a bit late

i'd base it like minix.

this is not designed for large disk FS replacement, for your 256TB rack... im thinking more floppies, flash etc. it set to handle >4gb but not 2^64... max file size is 2gb (for 1kb blocks)... bad block bitmap, good for bad flash etc...

layout;
superblock
none / some reserved space
bad block bitmap
used block bitmap
inodes
root

the superblock has LBA pointers to all things so it can be in any order in any part of disk, thus you could resize inode data etc.
minix does inodes = blocks/3 thing, which is pretty general and seems to work well. if its not enough you can always grow it (as long as its contiguous)

superblock
- int64 - magic
- int64 - number of blocks on disk
- int64 - reserved blocks following first block
- int64 logical block address (LBA) to bitmap of used blocks
- int64 LBA to bitmap of bad blocks
- int64 LBA to inodes
- int16 - block size / 512 (1=512, 2=1k, 8=4k, 16=16k)
32 bytes - utf8 volume name
byte - flags (0x01 - dirty)

id rather have volumename in the superblock than waste an attribute bit just to store it in a directory as a single use 1 off item.

inode
int64 - filesize
int32- last mod datetime stamp (seconds since.. 1900)
1 byte - attributes
int64[3] - direct blocks LBA
int64 - single indirect block LBA
int64 - double indirect block LBA
int64 - triple indirect block LBA
reserved 3 bytes (rounds out to 64)

attributes
0x01 - directory
0x02 - exec
0x04 - read
0x08 - write
0x10 - na
0x20 - na
0x40 - hidden
0x80 - system

directory entry
int32 - inode number
62 bytes - utf8 filename

the first inode (0) is always root directory inode.
the first directory entry is always a backlink to parent directory inode the ".."

Assembler · **Posted:** Tue Jan 17, 2006 5:01 pm

Pype.Clicker wrote:

considering the repeated concerns with the file names stuff, i'd tend to suggest to follow the kind of approach that one can find in an executable file, that is

- the 'file entries' are compact. They only contain 'fixed size' stuff such as timestamps, file size, sectors information, etc.
- all the file names are collated together into the "names section" (that could be, for instance, another "special area" or just a specific file entry such as the first one, or the file entry with type==X).

A file entry would therefore not carry the filename but instead the offset of its name in the "names area". That makes reading from the camel ultra-simple: load all the "names file", browse the files entries using names[entry->fname] rather than entry->fname for comparison.

Extending the idea, you could have a couple of "names" rather than a single one: the "prefix" name (reused by all the files in the same directory) and the "own" name (e.g. the filename "per se").

For systems short on RAM, the offset within the name area can be directly translated into a single block to be loaded.

anyone interested ?

I totally agree with pype, an I-node based fs will be better it will allow hard links & symbolic links.
And adding a new block for filename will allow for better naming schemes we will not tied up for 8 bytes for name or so...
This will also make it easy for UTF-8 or any other encoded filenames.

Assembler,

Kemp · **Posted:** Tue Jan 17, 2006 7:17 pm

I would prefer that soft-links (that's what windows uses for shortcuts right?) were supported. Though doesn't windows just store them as a file that tells it where to go anyway?

OSDev.org

Universal File System

Who is online