diff --git a/docs/mem_layout.md b/docs/mem_layout.md
new file mode 100644
index 0000000..b4ec4d3
--- /dev/null
+++ b/docs/mem_layout.md
@@ -0,0 +1,22 @@
+# Physical memory layout
+
+## Low memory
+
+| Type     | Range               | Use                                             |
+|----------|---------------------|-------------------------------------------------|
+| Reserved | 0x000000 - 0x000400 | real-mode interrupt vector table                |
+| Reserved | 0x000400 - 0x000500 | bios data area                                  |
+| Usable   | 0x000500 - 0x004000 | main stack                                      |
+| Usable   | 0x004000 - 0x006a00 | globals                                         |
+| Usable   | 0x006a00 - 0x007c00 | memory map                                      |
+| Usable   | 0x007c00 - 0x007e00 | boot sector                                     |
+| Usable   | 0x007e00 - 0x080000 | conventional usable memory                      |
+| Reserved | 0x080000 - 0x0a0000 | extended bios data area (maximum possible size) |
+| Reserved | 0x0a0000 - 0x0c0000 | video memory                                    |
+| Reserved | 0x0c0000 - 0x0c8000 | video bios                                      |
+| Reserved | 0x0c8000 - 0x0f0000 | bios expansions                                 |
+| Reserved | 0x0f0000 - 0x100000 | motherboard bios                                |
+
+TODO: ensure that we don't exceed the bounds of the regions above (this note was left unfinished; confirm which limit was meant)
+TODO: once we're in protected mode, repurpose s2 and s3 for a stack
+TODO: load s4 into a separate memory region
diff --git a/include/defines.s b/include/defines.s
index ae70d7a..4dc935a 100644
--- a/include/defines.s
+++ b/include/defines.s
@@ -62,13 +62,3 @@
 %define VGA_COL                0x1c
 %define TEXTBUF_LINE           0x1e
 %define MEMMAP_ENTRIES         0x20
-
-%macro fnstart 0
-  push bp
-  mov bp, sp
-%endmacro
-
-%macro fnret 0
-  pop bp
-  ret
-%endmacro
diff --git a/include/fn.s b/include/fn.s
new file mode 100644
index 0000000..dc5bc83
--- /dev/null
+++ b/include/fn.s
@@ -0,0 +1,14 @@
+%ifndef BOOT_FN_H
+%define BOOT_FN_H
+
+%macro fnstart 0 ; Function prologue: save the caller's bp, then point bp at the new frame.
+  push bp
+  mov bp, sp
+%endmacro
+
+%macro fnret 0 ; Function epilogue: restore the caller's bp and return (sp must point at the saved bp).
+  pop bp
+  ret
+%endmacro
+
+%endif
diff --git a/include/layout.s b/include/layout.s
new file mode 100644
index 0000000..2ec1220
--- /dev/null
+++ b/include/layout.s
@@ -0,0 +1,18 @@
+%ifndef BOOT_LAYOUT_H
+%define BOOT_LAYOUT_H
+
+%define S1_ADDR 0x7c00 ; Stage 1 origin (used as the org of s1.s): the boot sector address.
+%define S2_ADDR 0x8200 ; Stage 2 load address; stage 1 reads the sector here and jumps to it.
+
+; %define MEMMAP          0x6a00
+; %define MEMMAP_END      S2_ADDR
+; %define MEMMAP_ENT_SIZE 32
+; %define MEMMAP_CAP      ((MEMMAP_END - MEMMAP) / MEMMAP_ENT_SIZE)
+
+%define REAL_GLOBALS     0x4000       ; Base of the globals region; stage 2 copies stage 1's variables here.
+%define REAL_GLOBALS_END 0x6a00       ; End of the globals region.
+%define REAL_STACK_BASE  REAL_GLOBALS ; Initial bp/sp; the stack grows down, away from the globals.
+
+%define S234_MAGIC 0x544e4150 ; Value stage 2 expects at s234_magic (same constant as LONG(...) in s2.ld).
+
+%endif
diff --git a/include/s1_vars.s b/include/s1_vars.s
new file mode 100644
index 0000000..e9f080d
--- /dev/null
+++ b/include/s1_vars.s
@@ -0,0 +1,28 @@
+%ifndef BOOT_S1_VARS_H
+%define BOOT_S1_VARS_H
+
+; Stage 1 base stack frame variable offsets / globals
+; (we use the same offsets once we copy the variables to the globals section)
+; -------------------------------------------------------------------------------------------------
+; The boot drive number given to us by the BIOS.
+%define BOOT_DRIVE             0x02
+; Boot drive geometry
+%define SECTORS_PER_TRACK      0x04
+%define N_HEADS                0x06
+; Starting LBA of the GPT partition entries array.
+%define GPT_ENTRIES_START_LBA  0x08
+; Number of GPT entries, saturated to 16 bits.
+%define GPT_N_ENTRIES_16       0x0a
+; Number of sectors to advance by once we've read every GPT entry in the current sector.
+%define GPT_SECTOR_STRIDE      0x0c
+; Number of bytes to advance by in the current sector once we've read a GPT entry.
+%define GPT_BYTE_STRIDE        0x0e
+; Number of GPT entries which can fit in a single sector.
+%define GPT_ENTRIES_PER_SECTOR 0x10
+%define GPT_CURRENT_ENTRY_IDX  0x12 ; Index of the GPT entry currently being examined.
+%define GPT_SECTOR_ENTRY_IDX   0x14 ; Index of the current entry within the currently loaded sector.
+%define GPT_SECTORS_LOADED     0x16 ; Number of GPT entry sectors loaded so far.
+%define GPT_CURRENT_LBA        0x18 ; LBA of the GPT entry sector currently loaded.
+%define STAGE_2_GPT_ENTRY_ADDR 0x1a ; Memory address of the GPT entry found for the stage 2 partition.
+
+%endif
diff --git a/include/s2_fns.s b/include/s2_fns.s
new file mode 100644
index 0000000..e4defd1
--- /dev/null
+++ b/include/s2_fns.s
@@ -0,0 +1,8 @@
+%ifndef BOOT_S2_FNS_H
+%define BOOT_S2_FNS_H
+
+extern addr32_to_addr16
+extern read_sector
+extern panic_simple
+
+%endif
diff --git a/justfile b/justfile
index 48d813e..9a1e5d2 100644
--- a/justfile
+++ b/justfile
@@ -1,3 +1,6 @@
+include_flags := "-Iinclude"
+common_flags := "-werror " + include_flags
+
 run:
   qemu-system-x86_64 \
     -monitor stdio \
@@ -6,11 +9,19 @@ run:
     -m 512M \
     -drive format=raw,file=disk.bin
 
-
 build:
-  nasm -f bin -Iinclude -o boot0.bin boot0.s
-  cd boot1; cargo build --release
-  # nasm -f bin -Iinclude -o boot1.bin boot1.s
+  nasm -f bin {{common_flags}} -o s1.bin stages/s1/s1.s
+  nasm -f elf {{common_flags}} -o stages/s2/s2.o stages/s2/s2.s
+  nasm -f elf {{common_flags}} -o stages/s3/s3.o stages/s3/s3.s
+  nasm -f elf {{common_flags}} -o stages/s3/a20.o stages/s3/a20.s
+
+  ld -T s2.ld -o s234.bin stages/s2/*.o stages/s3/*.o
+# TODO: try with gnu ld
+
+# build:
+#   nasm -f bin -Iinclude -o boot0.bin boot0.s
+#   cd boot1; cargo build --release
+#   # nasm -f bin -Iinclude -o boot1.bin boot1.s
 
 zero_disk:
   dd if=/dev/zero of=disk.bin bs=512 count=1000
@@ -31,10 +42,10 @@ partition_disk:
   parted --script disk.bin mkpart stage2 70s 900s
   parted --script disk.bin type 6 fdffea69-3651-442f-a11d-88a09bf372dd
 
-write_stage1:
-  dd if=/dev/zero of=disk.bin bs=440 count=1 conv=notrunc
-  dd if=boot0.bin of=disk.bin conv=notrunc
+# write_stage1:
+#   dd if=/dev/zero of=disk.bin bs=440 count=1 conv=notrunc
+#   dd if=boot0.bin of=disk.bin conv=notrunc
 
-write_stage2:
-  # dd if=boot1.bin of=disk.bin bs=512 seek=70 conv=notrunc
-  dd if=boot1/target/target_protected/release/boot1 of=disk.bin bs=512 seek=70 conv=notrunc
+# write_stage2:
+#   # dd if=boot1.bin of=disk.bin bs=512 seek=70 conv=notrunc
+#   dd if=boot1/target/target_protected/release/boot1 of=disk.bin bs=512 seek=70 conv=notrunc
diff --git a/s2.ld b/s2.ld
new file mode 100644
index 0000000..dd7697b
--- /dev/null
+++ b/s2.ld
@@ -0,0 +1,34 @@
+OUTPUT_FORMAT("binary")
+
+SECTIONS {
+  /* Stage 2 load address (S2_ADDR in include/layout.s); '.' may only be assigned inside SECTIONS. */
+  . = 0x8200;
+  /* Stage 2 must come first so it's in the single sector loaded by stage 1. */
+  .s2_text : {
+    KEEP(*(.s2_text))
+    *(.s2_text)
+  }
+
+  .s3_text : {
+    KEEP(*(.s3_text))
+    *(.s3_text)
+  }
+
+  .s3_data : {
+    KEEP(*(.s3_data))
+    *(.s3_data)
+  }
+
+  .magic : {
+    /* Magic bytes stage 2 uses to make sure it's loaded the subsequent sectors correctly. */
+    LONG(0x544e4150)
+  }
+
+  s234_magic = ADDR(.magic);
+
+  /* Define a symbol for the total length of the binary, so the prelude knows how many blocks to
+   * load from disk.
+   */
+  s234_bin_len = . - 0x8200;
+  s234_bin_sectors = (s234_bin_len + 511) / 512;
+}
diff --git a/stages/s1/s1.s b/stages/s1/s1.s
new file mode 100644
index 0000000..c5763a2
--- /dev/null
+++ b/stages/s1/s1.s
@@ -0,0 +1,252 @@
+%include "layout.s"
+%include "s1_vars.s"
+
+[org S1_ADDR]
+[bits 16]
+
+main: ; Stage 1 entry point (this file is assembled with org S1_ADDR).
+  cli
+
+  xor ax, ax
+
+  mov ds, ax ; ds = es = ss = 0, so the addresses used below are plain offsets.
+  mov es, ax
+
+  ; Put the stack base at REAL_STACK_BASE (0x4000).
+  ; Stack grows high->low, so we'll grow away from our globals and program text.
+  mov ss, ax
+  mov bp, REAL_STACK_BASE
+  mov sp, bp
+
+  ; Segment for VGA (0xb800 * 16 = 0xb8000)
+  mov ax, 0xb800
+  mov fs, ax
+
+  ; Set VGA mode
+  ; https://mendelson.org/wpdos/videomodes.txt
+  mov ax, 0x0003
+  int 0x10
+
+  ; Store boot drive number (dl). NOTE(review): relies on the int 0x10 call above preserving dx.
+  xor dh, dh
+  push dx ; [bp - BOOT_DRIVE]
+
+  ; Get drive geometry
+  mov di, 0x00
+  mov ah, 0x08
+  int 0x13
+  jc panic
+  ; Load sectors per track into cx & spill
+  and cl, 0x3f ; Keep only the low 6 bits of cl.
+  xor ch, ch
+  push cx ; [bp - SECTORS_PER_TRACK]
+  ; Load number of heads into bx & spill
+  movzx bx, dh
+  inc bx ; NOTE(review): presumably dh is the highest head index, so +1 gives the count.
+  push bx ; [bp - N_HEADS]
+
+  ; Load LBA 1 to 0x7e00; the checks below treat it as the GPT header.
+  mov ax, 1
+  mov bx, 0x7e00
+  call read_lba
+
+  ; Check the GPT header magic "EFI PART"
+  ; NOTE(review): repe cmpsb assumes the direction flag is clear; stage 1 never executes cld.
+  mov cx, GPT_MAGIC_LEN
+  mov si, gpt_magic
+  mov di, 0x7e00
+  repe cmpsb
+  jne panic
+
+  ; Ensure the 8-byte GPT starting LBA fits in 16 bits
+  mov di, 0x7e00 ; The rep increments di so we need to reset it
+  mov eax, [di + 0x4c] ; Bits 32-63 of the starting LBA
+  mov bx, [di + 0x4a]  ; Bits 16-31 of the starting LBA
+  or ax, bx            ; Fold bits 16-31 into eax so a single test covers bits 16-63
+  or eax, eax
+  jnz panic
+  ; Store the first 16 bits of the GPT starting LBA (we have made sure the remaining bits are 0)
+  push word [di + 0x48] ; [bp - GPT_ENTRIES_START_LBA]
+
+  ; Load number of partitions
+  mov ax, [di + 0x50] ; Low 16 bits
+  mov bx, [di + 0x52] ; High 16 bits
+  or bx, bx
+  jz .gpt_n_partitions_loaded
+  ; Number of partitions overflows 16 bits, so we just concern ourselves with the first 65535.
+  ; That's an awful lot of partitions anyway.
+  mov ax, 0xffff
+.gpt_n_partitions_loaded:
+  push ax ; [bp - GPT_N_ENTRIES_16]
+
+  ; Load GPT entry size (eax is consumed by the checks below; ebx keeps an untouched copy)
+  mov eax, [di + 0x54] ; Operand size override otherwise this is going to be painful
+  mov ebx, eax
+  ; Assert that the entry size is 128 * 2^n for some integer n>=0. This is required for a valid GPT
+  ; and has the nice properties that:
+  ; - If each entry is larger than a sector (512 bytes), they'll be sector-aligned.
+  ; - If each entry is smaller than a sector, an integer number of them will fit into a sector.
+  or eax, eax  ; Test size != 0 because 128 * 2^n != 0
+  jz panic
+  test eax, 127 ; Test size is a multiple of 128
+  jnz panic
+  ; Use the (n & (n - 1)) == 0 trick to test if the entry size is a power of 2. Since we already
+  ; know it's a nonzero multiple of 128, if size is a power of 2 then size = 128 * 2^n holds.
+  ; Therefore we don't need to bother dividing by 128 first (shr 7), which saves a couple of bytes.
+  mov ecx, ebx
+  dec ecx
+  and ecx, eax
+  jnz panic
+
+  ; Find the "sector stride", which is the number of sectors we increment by each time we want to
+  ; load a new entry.
+  shr eax, 9      ; Divide by sector size to get sectors per entry
+  cmp eax, 0xffff ; Make sure sectors per entry fits in 16 bits
+  ja panic
+  or ax, ax
+  jnz .gpt_sector_stride_loaded
+  ; Sector stride must be at least one or we'll load the same sector each time!
+  inc ax
+.gpt_sector_stride_loaded:
+  push ax ; [bp - GPT_SECTOR_STRIDE]
+
+  ; Find the "byte stride", which is the number of bytes we increment by each time we want to load
+  ; the next entry in the same sector.
+  cmp ebx, 512
+  jb .gpt_find_entries_per_sector
+  push word 0 ; Arbitrary byte stride since there's only one entry per sector ([bp - GPT_BYTE_STRIDE])
+  push word 1 ; 1 entry per sector, since an entry is larger than a sector ([bp - GPT_ENTRIES_PER_SECTOR])
+  jmp .gpt_found_entries_per_sector
+.gpt_find_entries_per_sector:
+  push bx     ; Store byte stride = entry length in this case ([bp - GPT_BYTE_STRIDE])
+  xor dx, dx
+  mov ax, 512
+  div bx      ; Find entries per sector
+  push ax     ; [bp - GPT_ENTRIES_PER_SECTOR]
+.gpt_found_entries_per_sector:
+
+  ; Set up stack variables for our second stage search loop.
+  xor ax, ax
+  push ax                                ; Current entry ([bp - GPT_CURRENT_ENTRY_IDX])
+  push ax                                ; Current entry within the current sector ([bp - GPT_SECTOR_ENTRY_IDX])
+  push ax                                ; Number of sectors loaded ([bp - GPT_SECTORS_LOADED])
+  push word [bp - GPT_ENTRIES_START_LBA] ; Current LBA ([bp - GPT_CURRENT_LBA])
+
+  ; Search for the partition storing our second stage.
+.loop_find_stage2:
+  mov dx, [bp - GPT_CURRENT_ENTRY_IDX]
+  cmp [bp - GPT_N_ENTRIES_16], dx
+  ; Panic if we've run out of partitions and haven't found the second stage yet.
+  jbe panic
+
+  ; If we haven't loaded any sectors yet, load the first one.
+  cmp word [bp - GPT_SECTORS_LOADED], 0
+  je .load_first_lba
+  ; If there's still more entries in the current sector, skip loading a new sector
+  mov ax, [bp - GPT_SECTOR_ENTRY_IDX]   ; Load current entry index within the current sector
+  cmp [bp - GPT_ENTRIES_PER_SECTOR], ax ; Compare to entries per sector
+  ja .process_current_entry
+
+  mov ax, [bp - GPT_SECTOR_STRIDE]        ; Load sector stride
+  add word [bp - GPT_CURRENT_LBA], ax     ; Increment current LBA by sector stride
+  mov word [bp - GPT_SECTOR_ENTRY_IDX], 0 ; Reset the current entry index within the current sector
+.load_first_lba:
+  ; Read the current LBA to 0x8000 (just past the end of the GPT header)
+  mov ax, [bp - GPT_CURRENT_LBA]
+  mov bx, 0x8000
+  call read_lba
+  ; Increment number of sectors loaded
+  inc word [bp - GPT_SECTORS_LOADED]
+
+.process_current_entry:
+  ; Calculate the address of the current GPT entry.
+  mov ax, [bp - GPT_SECTOR_ENTRY_IDX] ; Load current entry index within current sector
+  xor dx, dx
+  mul word [bp - GPT_BYTE_STRIDE]     ; Get the byte offset in the current sector of the current entry
+  add ax, 0x8000                      ; Convert offset to address (we loaded the sector at 0x8000)
+
+  ; Compare the first GUID_LEN bytes of the entry to our stage 2 partition GUID.
+  mov cx, GUID_LEN
+  mov si, guid_stage2
+  mov di, ax
+  repe cmpsb
+  je .found_stage2 ; ax still holds the entry address here; cmpsb only advances si/di/cx.
+
+  ; Next iteration
+  inc word [bp - GPT_CURRENT_ENTRY_IDX] ; Increment current entry index
+  inc word [bp - GPT_SECTOR_ENTRY_IDX]  ; Increment current entry index within the current sector
+  jmp .loop_find_stage2
+
+.found_stage2:
+  push ax ; Address of the GPT entry for stage 2 ([bp - STAGE_2_GPT_ENTRY_ADDR])
+  mov si, ax
+
+  ; Load partition LBA start.
+  mov eax, [si + 0x20] ; Lower 32 bits
+  mov ebx, [si + 0x24] ; Upper 32 bits
+  ; Ensure it fits in 16 bits: the upper dword must be zero and the lower dword at most 0xffff.
+  or ebx, ebx
+  jnz panic
+  cmp eax, 0xffff
+  ja panic
+  ; Load partition LBA end.
+  mov ecx, [si + 0x28] ; Lower 32 bits
+  mov edx, [si + 0x2c] ; Upper 32 bits
+  ; Assert that the end LBA is greater than or equal to the start LBA, so we have at least one
+  ; sector to load (end LBA is inclusive).
+  or edx, edx
+  jnz .stage2_end_lba_ok
+  cmp eax, ecx
+  ja panic
+.stage2_end_lba_ok:
+
+  mov bx, S2_ADDR ; Read the partition's first sector (LBA in ax) to S2_ADDR...
+  call read_lba
+  jmp bx          ; ...and hand control to stage 2.
+
+  ; Load a single boot disk sector via int 0x13 / ah = 0x02, converting the LBA to CHS. Panic on failure.
+  ; Inputs:
+  ; - ax: LBA to load
+  ; - bx: address to read sector to (passed to the BIOS unchanged)
+  ; Clobber: ax, cx, dx
+read_lba:
+  ; sector - 1 = LBA  % sectors_per_track
+  ; temp       = LBA  / sectors_per_track
+  ; head       = temp % n_heads
+  ; cylinder   = temp / n_heads
+  xor dx, dx
+  ; Divide by sectors per track. dx = mod (sector - 1), ax = div (temp)
+  div word [bp - SECTORS_PER_TRACK]
+  ; Put the sector into cx (the bios call will use cl)
+  mov cx, dx
+  inc cx
+  xor dx, dx
+  ; Divide by number of heads. dx = mod (head), ax = div (cylinder)
+  div word [bp - N_HEADS]
+  mov dh, dl ; dh = head
+  mov ch, al ; ch = low 8 bits of the cylinder; any higher cylinder bits in ah are dropped.
+  mov dl, byte [bp - BOOT_DRIVE]
+  mov ah, 0x02 ; BIOS disk function number
+  mov al, 1    ; Number of sectors to read
+  ; Read sector
+  int 0x13
+  jc panic
+  ret
+
+panic: ; Stage 1 failure handler: reset the display, show a marker character and halt.
+  mov ax, 0x0003               ; Same video mode call as at the top of main.
+  int 0x10
+  mov word fs:[0x0000], 0x4f21 ; Character 0x21 ('!') with attribute 0x4f in the first cell (fs = 0xb800).
+  hlt                          ; NOTE(review): no loop follows hlt here, unlike panic_simple in stage 2.
+
+gpt_magic     db "EFI PART"
+GPT_MAGIC_LEN equ $ - gpt_magic
+
+; Our stage2 guid: fdffea69-3651-442f-a11d-88a09bf372dd (the first three fields are byte-swapped below)
+guid_stage2 db 0x69, 0xea, 0xff, 0xfd, 0x51, 0x36, 0x2f, 0x44, \
+               0xa1, 0x1d, 0x88, 0xa0, 0x9b, 0xf3, 0x72, 0xdd
+GUID_LEN   equ $ - guid_stage2
+
+; MBR bootstrap field is 440 bytes long
+%if ($ - $$) > 440
+%error "exceeded mbr bootstrap field size"
+%endif
diff --git a/stages/s2/s2.o b/stages/s2/s2.o
new file mode 100644
index 0000000..09adf8d
Binary files /dev/null and b/stages/s2/s2.o differ
diff --git a/stages/s2/s2.s b/stages/s2/s2.s
new file mode 100644
index 0000000..7e922b8
--- /dev/null
+++ b/stages/s2/s2.s
@@ -0,0 +1,217 @@
+[bits 16]
+
+%include "fn.s"
+%include "layout.s"
+%include "s1_vars.s"
+
+; Symbols defined by the linker script (s2.ld).
+extern s234_bin_len
+extern s234_bin_sectors
+extern s234_magic
+
+; Stage 3 entry point (stages/s3/s3.s).
+extern s3_main
+
+section .s2_text
+
+%macro copy_stack_var_to_globals 2 ; Args: %1 = scratch register, %2 = variable offset (see s1_vars.s).
+  mov %1, [bp - %2]           ; Read the stage 1 stack variable at bp - offset...
+  mov [REAL_GLOBALS + %2], %1 ; ...and store it at REAL_GLOBALS + offset.
+%endmacro
+
+; Stage 2 entry: load the rest of the s234 binary (stages 3 and 4) into memory, then run stage 3.
+load_s234:
+  ; Now that we're not doing instruction byte golf like we were in stage 1, we can afford to move
+  ; the various stage 1 stack variables to the globals section.
+  copy_stack_var_to_globals ax, BOOT_DRIVE
+  copy_stack_var_to_globals ax, SECTORS_PER_TRACK
+  copy_stack_var_to_globals ax, N_HEADS
+  copy_stack_var_to_globals ax, GPT_ENTRIES_START_LBA
+  copy_stack_var_to_globals ax, GPT_N_ENTRIES_16
+  copy_stack_var_to_globals ax, GPT_SECTOR_STRIDE
+  copy_stack_var_to_globals ax, GPT_BYTE_STRIDE
+  copy_stack_var_to_globals ax, GPT_ENTRIES_PER_SECTOR
+  copy_stack_var_to_globals ax, GPT_CURRENT_ENTRY_IDX
+  copy_stack_var_to_globals ax, GPT_SECTOR_ENTRY_IDX
+  copy_stack_var_to_globals ax, GPT_SECTORS_LOADED
+  copy_stack_var_to_globals ax, GPT_CURRENT_LBA
+  copy_stack_var_to_globals ax, STAGE_2_GPT_ENTRY_ADDR
+
+  ; Reset the stack, now we've got everything we need from it.
+  mov sp, bp
+
+  mov si, [REAL_GLOBALS + STAGE_2_GPT_ENTRY_ADDR]
+  mov eax, [si + 0x20] ; Partition / boot1 start LBA lower
+  mov ebx, [si + 0x24] ; Partition / boot1 start LBA upper
+  mov ecx, [si + 0x28] ; Partition end LBA lower
+  mov edx, [si + 0x2c] ; Partition end LBA upper (0x28 + 4, the same offset stage 1 reads)
+
+  ; Panic if the partition / boot1 starting LBA overflows 16 bits.
+  or ebx, ebx
+  jnz panic_simple
+  ror eax, 16 ; Bring the upper half of eax into ax to test it...
+  or ax, ax
+  jnz panic_simple
+  ror eax, 16 ; ...then rotate back so ax holds the start LBA again.
+
+  ; Calculate the inclusive s234 end LBA (start + sector count - 1), panicking if start + sector
+  ; count overflows 16 bits. ebx is zero before this, so bx and ebx both hold the s234 end LBA.
+  mov bx, ax
+  add bx, s234_bin_sectors
+  jc panic_simple
+  dec bx ; Exclusive end -> LBA of the last sector that belongs to the binary.
+
+  ; Panic if the s234 end LBA is after the partition end LBA.
+  ; If the upper 32 bits of the partition end LBA are nonzero, then it must be greater than our
+  ; 16-bit s234 end LBA.
+  or edx, edx
+  jnz .end_lba_ok
+  ; Compare the s234 end LBA to the lower 32 bits of the partition end LBA.
+  cmp ebx, ecx
+  ja panic_simple
+
+.end_lba_ok:
+  ; The first sector has already been loaded (we're running it right now!) so increment the
+  ; current LBA.
+  inc ax
+  push ax                     ; Current LBA ([bp - 0x02])
+  push bx                     ; s234 end LBA, inclusive ([bp - 0x04])
+  mov ebx, S2_ADDR + 512 ; Current sector load address
+
+.load_loop:
+  mov ax, [bp - 0x02]      ; Load current LBA
+  cmp word [bp - 0x04], ax ; Compare to s234 end LBA
+  jb .load_done            ; Done once the current LBA is past the last sector of the binary.
+
+  mov ecx, ebx
+  call read_sector ; ax = LBA, ecx = destination address; ebx is preserved by read_sector.
+  jc panic_simple
+
+  add ebx, 512
+  inc word [bp - 0x02]
+  jmp .load_loop
+
+.load_done:
+
+  ; Check the magic bytes at the end of s234.
+  push es
+  mov ebx, s234_magic
+  call addr32_to_addr16
+  cmp dword es:[bx], S234_MAGIC
+  pop es ; pop does not modify flags, so the comparison result survives for the jne below.
+  jne panic_simple
+
+  jmp s3_main
+
+
+; Converts a 32-bit address to a 16-bit real-mode segment and offset.
+; Arguments:
+; - ebx: 32-bit address
+; Return:
+; - es: 16-bit address segment (unchanged on failure)
+; - ebx: 16-bit address offset (on failure ebx is left modified and is not meaningful)
+; - cf: unset on success, set on failure
+; Clobber: none (other than the return registers and flags)
+addr32_to_addr16:
+  fnstart
+  push es  ; Saved so the failure path can restore it.
+  push eax
+
+  mov eax, ebx
+  ; Divide addr by 16 and saturate to 16 bits to get the segment.
+  shr eax, 4
+  ror eax, 16 ; Swap halves so ax holds the bits that would not fit in a 16-bit segment.
+  or ax, ax
+  jz .segment_ok
+  mov eax, 0xffff0000 ; Becomes 0x0000ffff after the rotate below.
+.segment_ok:
+  ror eax, 16
+  mov es, ax
+
+  ; Calculate offset = addr - (16 * segment), failing if the offset doesn't fit in 16 bits.
+  shl eax, 4
+  sub ebx, eax
+  ror ebx, 16 ; Swap halves so bx holds the upper 16 bits of the offset.
+  or bx, bx
+  jnz .fail
+  ror ebx, 16 ; Swap back: ebx = offset with the upper half zero.
+
+  pop eax
+  add sp, 2 ; Discard the original es from the stack
+  pop bp
+  clc       ; add above changed the flags, so clear cf explicitly to signal success.
+  ret
+
+.fail:
+  pop eax
+  pop es
+  stc ; The pop bp inside fnret does not touch flags, so cf stays set for the caller.
+  fnret
+
+global addr32_to_addr16
+
+
+; Reads a single sector at the given LBA into memory, using the drive info in the globals region.
+; Arguments:
+; - ax: start LBA
+; - ecx: address to read sector to
+; Return:
+; - cf: unset on success, set on failure (address conversion failed, or cf as left by int 0x13)
+; Clobber: eax, ecx, edx
+read_sector:
+  ; sector - 1 = LBA  % sectors_per_track
+  ; temp       = LBA  / sectors_per_track
+  ; head       = temp % n_heads
+  ; cylinder   = temp / n_heads
+
+  fnstart
+  push es
+  push ebx
+
+  mov ebx, ecx
+  call addr32_to_addr16 ; es:bx = destination buffer; eax (the LBA) is preserved by the call.
+  jc .return
+
+  ; Calculate sector and temp
+  xor dx, dx
+  ; Divide by sectors per track. dx = mod (sector - 1), ax = div (temp)
+  div word [REAL_GLOBALS + SECTORS_PER_TRACK]
+  ; Put the sector into cx (the bios call will use cl)
+  mov cx, dx
+  inc cx
+
+  ; Calculate head and cylinder
+  xor dx, dx
+  ; Divide by number of heads. dx = mod (head), ax = div (cylinder)
+  div word [REAL_GLOBALS + N_HEADS]
+  mov dh, dl ; dh = head
+  mov ch, al ; ch = low 8 bits of the cylinder; any higher cylinder bits in ah are dropped.
+
+  mov dl, byte [REAL_GLOBALS + BOOT_DRIVE]
+  mov ah, 0x02 ; BIOS disk function number
+  mov al, 1    ; Number of sectors to read
+  ; Read sector
+  int 0x13
+
+.return:
+  pop ebx ; The pops and fnret leave cf untouched, so the caller sees the failure flag.
+  pop es
+  fnret
+
+global read_sector
+
+
+panic_simple: ; Failure handler: reset the display, show a marker character and halt forever.
+  mov ax, 0x0003               ; Same video mode call stage 1 makes on entry.
+  int 0x10
+  mov word fs:[0x0000], 0x4f21 ; Character 0x21 ('!') with attribute 0x4f; relies on fs = 0xb800 from stage 1.
+.halt:
+  hlt
+  jmp .halt ; Halt again if execution ever resumes after hlt.
+
+global panic_simple
+
+
+%if ($ - $$) > 512
+%error "stage 2 exceeded sector size"
+%endif
diff --git a/stages/s3/a20.o b/stages/s3/a20.o
new file mode 100644
index 0000000..d265326
Binary files /dev/null and b/stages/s3/a20.o differ
diff --git a/stages/s3/a20.s b/stages/s3/a20.s
new file mode 100644
index 0000000..e37120e
--- /dev/null
+++ b/stages/s3/a20.s
@@ -0,0 +1,112 @@
+[bits 16]        ; Called from real-mode stage 3; without this NASM's ELF output assembles 32-bit code.
+%include "fn.s"
+%include "ps2.s"
+section .s3_text ; Same section as s3.s, so s2.ld places these routines inside the stage 3 image.
+
+%macro mov_out 3 ; mov_out reg, port, value: load value into reg, then write reg to port.
+  mov %1, %3
+  out %2, %1
+%endmacro
+
+; Check whether the A20 line is enabled. Writes to the boot sector identifier.
+; Arguments: none
+; Return:
+; - ax: 0 if A20 disabled, nonzero if A20 enabled
+; Clobber: none (NOTE(review): the plain [0x7dfe] accesses assume ds = 0 — confirm at call sites)
+test_a20:
+  push bp
+  mov bp, sp
+  push gs
+
+  ; Restore the boot sector identifier in case it was overwritten by anything.
+  mov word [0x7dfe], 0xaa55
+
+  mov ax, 0xffff
+  mov gs, ax ; gs:[0x7e0e] = 0xffff0 + 0x7e0e = 0x107dfe
+  xor ax, ax ; ax = 0, so setne below yields a clean 0 / 1 result in ax.
+
+  ; If the word at 0x107dfe (1 MiB after the boot sector identifier) is different to the boot
+  ; sector identifier, then A20 must be enabled.
+  cmp word gs:[0x7e0e], 0xaa55
+  setne al
+  jne .return
+
+  ; Even if A20 was enabled, the two words may have been equal by chance, so we temporarily swap
+  ; the boot sector identifier bytes and test again.
+  ror word [0x7dfe], 8 ; 0xaa55 -> 0x55aa
+  cmp word gs:[0x7e0e], 0x55aa
+  setne al
+  ror word [0x7dfe], 8 ; Swap back to 0xaa55; al was already captured by setne.
+  jmp .return
+
+.return:
+  pop gs
+  pop bp
+  ret
+
+global test_a20
+
+
+; Try to enable A20 using the Intel 8042 PS/2 keyboard controller.
+; Arguments: none
+; Return: none (call test_a20 afterwards to find out whether it worked)
+; Clobber: ax, cx, dx (the body itself only writes al and cl)
+enable_a20_intel_8042:
+  ; Temporarily disable the keyboard.
+  call intel_8042_wait_write
+  mov_out al, INTEL_8042_OUT_CMD, INTEL_8042_CMD_PS2_1_DISABLE
+
+  ; Read the controller output port.
+  call intel_8042_wait_write
+  mov_out al, INTEL_8042_OUT_CMD, INTEL_8042_CMD_CONTROLLER_OUT_PORT_READ
+  call intel_8042_wait_read
+  in al, INTEL_8042_IO_DATA
+
+  ; The second bit (value 2) is "A20 enabled", so set it.
+  mov cl, al ; Keep the value in cl: the wait routine below clobbers al.
+  or cl, 2
+
+  ; Write the modified byte back to the controller output port.
+  call intel_8042_wait_write
+  mov_out al, INTEL_8042_OUT_CMD, INTEL_8042_CMD_CONTROLLER_OUT_PORT_WRITE
+  call intel_8042_wait_write
+  mov_out al, INTEL_8042_IO_DATA, cl
+
+  ; Re-enable the keyboard.
+  call intel_8042_wait_write
+  mov_out al, INTEL_8042_OUT_CMD, INTEL_8042_CMD_PS2_1_ENABLE
+
+  ; Wait for writes to finish.
+  call intel_8042_wait_write
+
+  ret
+
+global enable_a20_intel_8042
+
+
+; Wait for the Intel 8042 input buffer to become empty, so we can write. Busy-waits with no timeout.
+; Arguments: none
+; Return: none
+; Clobber: al, flags
+intel_8042_wait_write:
+.loop:
+  ; Read the 8042 status register.
+  in al, INTEL_8042_IN_STATUS
+  ; Input buffer status flag set means the input buffer is full, so loop in this case.
+  test al, INTEL_8042_STATUS_MASK_IBUF
+  jnz .loop
+  ret
+
+
+; Wait for the Intel 8042 output buffer to become filled, so we can read. Busy-waits with no timeout.
+; Arguments: none
+; Return: none
+; Clobber: al, flags
+intel_8042_wait_read:
+.loop:
+  ; Read the 8042 status register.
+  in al, INTEL_8042_IN_STATUS
+  ; Output buffer status flag unset means output buffer is empty, so loop in this case.
+  test al, INTEL_8042_STATUS_MASK_OBUF
+  jz .loop
+  ret
diff --git a/stages/s3/s3.o b/stages/s3/s3.o
new file mode 100644
index 0000000..41a8674
Binary files /dev/null and b/stages/s3/s3.o differ
diff --git a/stages/s3/s3.s b/stages/s3/s3.s
new file mode 100644
index 0000000..7c039ec
--- /dev/null
+++ b/stages/s3/s3.s
@@ -0,0 +1,180 @@
+[bits 16]
+
+%include "fn.s"
+%include "layout.s"
+%include "s2_fns.s"
+
+extern test_a20
+extern enable_a20_intel_8042
+
+section .s3_text
+
+s3_main: ; Stage 3 entry point (stage 2 jumps here): enable A20, then switch to 32-bit protected mode.
+  call test_a20
+  test al, al
+  jnz .a20_enabled
+
+  ; Try to enable A20 using the Intel 8042 PS/2 keyboard controller.
+  call enable_a20_intel_8042
+  call test_a20
+  test al, al
+  jnz .a20_enabled
+
+  ; TODO: try other methods first before we panic:
+  ; - [ ] BIOS interrupt
+  ; - [ ] Fast A20 enable
+  jmp panic_simple
+
+.a20_enabled:
+  mov ax, 0x0003 ; Same video mode call stage 1 makes on entry.
+  int 0x10
+
+  ; Disable cursor
+  mov ax, 0x0100
+  mov cx, 0x3f00
+  int 0x10
+
+  ; Ensure interrupts are definitely disabled.
+  cli
+
+  ; Load our flat-address-space GDT.
+  lgdt [gdt_flat_slice]
+
+  ; Set the protected-mode bit in cr0.
+  mov eax, cr0
+  or al, 0x01
+  mov cr0, eax
+
+  ; Long jump to set the code segment to gdt_flat.segment_code, and to clear the instruction
+  ; pipeline.
+  jmp GDT_FLAT_IDX_CODE_32:.protected_mode_32
+
+[bits 32]
+.protected_mode_32: ; Everything from here on is assembled as 32-bit code.
+
+  ; Set the data segments to gdt_flat.segment_data.
+  mov eax, GDT_FLAT_IDX_DATA
+  mov ds, eax
+  mov es, eax
+  mov fs, eax
+  mov gs, eax
+  mov ss, eax
+
+  ; Reset the stack.
+  ; TODO: put the 32-bit stack somewhere else.
+  mov ebp, REAL_STACK_BASE
+  mov esp, ebp
+
+  ; TODO
+  ; jmp _start
+
+.halt: ; Nothing further to run yet (see the TODO above), so halt forever.
+  hlt
+  jmp .halt
+
+; panic_simple_32:
+;   mov word [0xb8000], 0x4f21
+; .halt:
+;   hlt
+;   jmp .halt
+
+global s3_main
+
+
+section .s3_data
+
+gdt_flat_slice: ; 6-byte operand for lgdt: 16-bit limit followed by the 32-bit table address.
+  dw GDT_FLAT_LEN - 1 ; The limit is the offset of the last valid byte, i.e. table size minus one.
+  dd gdt_flat
+
+global gdt_flat_slice
+
+; Segment descriptor layout
+; | Range (bits) | Field         |
+; |--------------|---------------|
+; |         0-16 | limit         |
+; |        16-32 | base          |
+; |        32-40 | base cont.    |
+; |        40-48 | access        |
+; |        48-52 | limit cont.   |
+; |        52-56 | flags         |
+; |        56-64 | base cont.    |
+;
+; Flags
+; - 0: reserved
+; - 1: long-mode code segment
+; - 2: size
+;   - unset: 16-bit
+;   - set: 32-bit
+; - 3: granularity
+;   - unset: limit is measured in bytes
+;   - set: limit is measured in 4KiB pages
+;
+; Access
+; - 0: accessed
+;   - unset: CPU will set it when the segment is accessed
+; - 1: readable / writable
+;   - data segments: is segment writable (data segments are always readable)
+;   - code segments: is segment readable (code segments are never writable)
+; - 2: direction / conforming
+;   - data segments: whether segment grows down
+;   - code segments: whether this can be executed from a lower-privilege ring
+; - 3: executable
+;   - unset: this is a data segment
+;   - set: this is a code segment
+; - 4: descriptor type
+;   - unset: this is a system segment (e.g. a task state segment)
+;   - set: this is a data or code segment
+; - 5-6: privilege level (ring number)
+; - 7: present (must be set)
+;
+
+; FIXME: copy this to a fixed memory location
+align 8
+gdt_flat:
+  ; First GDT entry must be 0.
+  dq 0
+
+; 32-bit code segment: present, ring 0, executable, readable, non-conforming (access 10011011b).
+; Bytes 0x0000 - 0xffff (base 0, limit 0xffff, byte granularity, size flag set = 32-bit).
+.segment_code_32:
+  db 0xff, 0xff, \
+     0x00, 0x00, \
+     0x00,       \
+     10011011b,  \
+     01000000b,  \
+     0x00
+
+; 16-bit code segment, to use if we want to switch back to real mode.
+; Bytes 0x0000 - 0xffff (same access byte as above, with the size flag clear = 16-bit).
+.segment_code_16:
+  db 0xff, 0xff, \
+     0x00, 0x00, \
+     0x00,       \
+     10011011b,  \
+     00000000b,  \
+     0x00
+
+; Data segment: present, ring 0, writable, not executable (access 10010011b), page granularity.
+; Pages 0x000000 - 0x0fffff, which covers the entire 32-bit address space (start of 0xfffff-th page
+; is 0xfffff * 4096 = 0xfffff000, end of page exclusive is 0xfffff000 + 4096 = 0x100000000).
+.segment_data:
+  db 0xff, 0xff, \
+     0x00, 0x00, \
+     0x00,       \
+     10010011b,  \
+     11001111b,  \
+     0x00
+
+global gdt_flat
+
+GDT_FLAT_LEN equ ($ - gdt_flat) ; Total size of the table in bytes (null entry + three descriptors).
+
+GDT_FLAT_IDX_CODE_32 equ (gdt_flat.segment_code_32 - gdt_flat)
+global GDT_FLAT_IDX_CODE_32
+
+GDT_FLAT_IDX_CODE_16 equ (gdt_flat.segment_code_16 - gdt_flat)
+global GDT_FLAT_IDX_CODE_16
+
+GDT_FLAT_IDX_DATA equ (gdt_flat.segment_data - gdt_flat)
+global GDT_FLAT_IDX_DATA