Eigenstate: myrddin-dev mailing list

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Add thread-local storage for POSIX-y platforms.


This patch implements thread-local storage by using a segment register
to store pointers to regions of memory unique to each thread. In spawned
threads, this region starts above the top of the stack, while in the
main thread, this region is statically allocated at first so it can be
used in `__init__` functions and dynamically reallocated if it needs to
grow beyond 8 slots. The API consists of three functions: `gettlskey()`,
which must be called from the main thread, reserves a slot and returns a
key; that key is then passed to `settls(k, v)` and `gettls(k)`. (One
concern with the latter two functions is that they effectively take and
return untyped pointers, allowing the user to freely shoot themselves in
the foot.) Each thread inherits its TLS slots from the thread that
spawned it, and any slot added after a given thread is spawned is not
available in that thread.

Adding TLS regions gives threads an easy way to get their own tids,
allowing us to add some basic correctness checks to the mutex and rwlock
code. A pointer to the base of the stack and the size of the mapping are
also stored in the TLS region, making it easy to support user-specified
stack sizes in the future.

Changes from previous version:
- Fixed size of main tls static allocation.
- More comments, less magic.

---
 bld.tags                                   |  4 ++
 lib/sys/sys+freebsd-x64.myr                | 12 +++-
 lib/sys/sys+linux-x64.myr                  | 16 ++++-
 lib/sys/sys+netbsd-x64.myr                 | 23 +++++++
 lib/sys/sys+openbsd-x64.myr                |  2 +-
 lib/sys/sys+openbsd:6.1-x64.myr            |  4 +-
 lib/sys/sys+openbsd:6.2-x64.myr            |  6 +-
 lib/sys/sys+openbsd:6.3-x64.myr            |  4 +-
 lib/thread/bld.sub                         | 12 ++++
 lib/thread/common.myr                      |  4 +-
 lib/thread/exit+freebsd-x64.s              | 11 +--
 lib/thread/exit+linux-x64.s                | 13 +---
 lib/thread/exit+openbsd-x64.s              | 11 +--
 lib/thread/fsbase+freebsd.myr              | 28 ++++++++
 lib/thread/fsbase+linux.myr                | 28 ++++++++
 lib/thread/fsbase+netbsd.myr               | 28 ++++++++
 lib/thread/fsbase+openbsd.myr              | 16 +++++
 lib/thread/mutex+futex.myr                 | 43 +++++++++++-
 lib/thread/mutex.myr                       |  1 -
 lib/thread/rwlock+futex.myr                | 27 +++++++-
 lib/thread/spawn+freebsd.myr               | 78 ++++++++++++----------
 lib/thread/spawn+linux.myr                 | 63 ++++++++---------
 lib/thread/spawn+openbsd.myr               | 60 +++++++++--------
 lib/thread/spawn+osx.myr                   | 68 +++++++++----------
 lib/thread/start+osx-x64.s                 | 11 +--
 lib/thread/test/die.myr                    |  8 +++
 lib/thread/test/tls.myr                    | 35 ++++++++++
 lib/thread/tls+fsbase.myr                  | 59 ++++++++++++++++
 lib/thread/tls+osx.myr                     | 70 +++++++++++++++++++
 lib/thread/tls-impl+fsbase-x64.s           | 48 +++++++++++++
 lib/thread/tls-impl+osx-x64.s              | 64 ++++++++++++++++++
 lib/thread/types+fsbase.myr                | 19 ++++++
 lib/thread/types+osx.myr                   | 20 ++++++
 mk/bootstrap/bootstrap+Darwin-x86_64.sh    | 11 +--
 mk/bootstrap/bootstrap+FreeBSD-amd64.sh    | 12 ++--
 mk/bootstrap/bootstrap+Linux-x86_64.sh     | 12 ++--
 mk/bootstrap/bootstrap+NetBSD-amd64.sh     | 12 ++--
 mk/bootstrap/bootstrap+OpenBSD-amd64.sh    | 12 ++--
 rt/start-freebsd.s                         | 15 +++++
 rt/start-linux.s                           | 11 +++
 rt/start-netbsd.s                          | 15 +++++
 rt/start-openbsd.s                         | 10 +++
 rt/start-osx.s                             | 11 +++
 support/syscall-gen/types+freebsd-x64.frag |  8 +++
 support/syscall-gen/types+linux-x64.frag   |  7 ++
 45 files changed, 820 insertions(+), 212 deletions(-)
 create mode 100644 lib/thread/fsbase+freebsd.myr
 create mode 100644 lib/thread/fsbase+linux.myr
 create mode 100644 lib/thread/fsbase+netbsd.myr
 create mode 100644 lib/thread/fsbase+openbsd.myr
 create mode 100644 lib/thread/test/die.myr
 create mode 100644 lib/thread/test/tls.myr
 create mode 100644 lib/thread/tls+fsbase.myr
 create mode 100644 lib/thread/tls+osx.myr
 create mode 100644 lib/thread/tls-impl+fsbase-x64.s
 create mode 100644 lib/thread/tls-impl+osx-x64.s
 create mode 100644 lib/thread/types+fsbase.myr
 create mode 100644 lib/thread/types+osx.myr

diff --git a/bld.tags b/bld.tags
index ce47dd11..c6eae4a7 100644
--- a/bld.tags
+++ b/bld.tags
@@ -1,3 +1,7 @@
+fsbase: freebsd
+fsbase: linux
+fsbase: netbsd
+fsbase: openbsd
 futex: freebsd
 futex: linux
 futex: openbsd:6.2
diff --git a/lib/sys/sys+freebsd-x64.myr b/lib/sys/sys+freebsd-x64.myr
index b89e99b9..73821b4f 100644
--- a/lib/sys/sys+freebsd-x64.myr
+++ b/lib/sys/sys+freebsd-x64.myr
@@ -38,6 +38,7 @@ pkg sys =
 	type cpulevel	= int
 	type cpusetid	= int
 	type idtype	= int
+	type sysarchop	= int64
 	
 	type acltype	= int
 	type acltag	= uint32
@@ -802,6 +803,13 @@ pkg sys =
 	const Siglwp	: signo = Sigthr
 	const Siglibrt	: signo = 33	/* reserved by real-time library. */
 	
+	/* sysarch ops */
+	const Archamd64getfs   : sysarchop = 128
+	const Archamd64setfs   : sysarchop = 129
+	const Archamd64getgs   : sysarchop = 130
+	const Archamd64setgs   : sysarchop = 131
+	const Archamd64getxfpu : sysarchop = 132
+	
 	extern const syscall : (sc:scno, args:... -> int64)
 	extern var __cenvp : byte##
 	
@@ -1285,7 +1293,7 @@ pkg sys =
 	const quotactl			:  (path : byte#, cmd : int, uid : int, arg : void# -> int)
 	const lgetfh			:  (fname : byte#, fhp : fhandle# -> int)
 	const getfh			:  (fname : byte#, fhp : fhandle# -> int)
-	const sysarch			:  (op : int, parms : byte# -> int)
+	const sysarch			:  (op : sysarchop, parms : void## -> int)
 	const rtprio			:  (function : int, pid : pid, rtp : rtprio# -> int)
 	const setfib			:  (fibnum : int -> int)
 	const ntp_adjtime		:  (tp : timex# -> int)
@@ -1969,7 +1977,7 @@ const getfh	= {fname, fhp
 	 -> (syscall(Sysgetfh, a(fname), a(fhp)) : int)
 }
 const sysarch	= {op, parms
-	 -> (syscall(Syssysarch, a(op), a(parms)) : int)
+	 -> (syscall(Syssysarch, op, a(parms)) : int)
 }
 const rtprio	= {function, pid, rtp
 	 -> (syscall(Sysrtprio, a(function), a(pid), a(rtp)) : int)
diff --git a/lib/sys/sys+linux-x64.myr b/lib/sys/sys+linux-x64.myr
index 61bb8b2d..bbfb6897 100644
--- a/lib/sys/sys+linux-x64.myr
+++ b/lib/sys/sys+linux-x64.myr
@@ -45,6 +45,7 @@ pkg sys =
 	type mfdflags	= uint32
 	type aiocontext	= uint64
 	type msg	= void#
+	type arch_prctlop	= uint64
 	
 	
 	type clock = union
@@ -590,6 +591,12 @@ pkg sys =
 	
 	/* return value for a failed mapping */
 	const Mapbad	: byte# = (-1 : byte#)
+
+	/* arch_prctl ops */
+	const Archsetgs : arch_prctlop = 0x1001
+	const Archsetfs : arch_prctlop = 0x1002
+	const Archgetfs : arch_prctlop = 0x1003
+	const Archgetgs : arch_prctlop = 0x1004
 	
 	/* signal flags */
 	const Sanocldstop	: sigflags = 0x00000001
@@ -1097,6 +1104,7 @@ pkg sys =
 	const Sysmq_notify		: scno = 244
 	const Sysmq_getsetattr		: scno = 245
 	const Sysprctl			: scno = 157
+	const Sysarch_prctl		: scno = 158
 	const Sysswapon			: scno = 167
 	const Sysswapoff		: scno = 168
 	const Sys_sysctl		: scno = 156
@@ -1308,7 +1316,7 @@ pkg sys =
 	const settimeofday		:  (tv : timeval#, tz : timezone# -> int64)
 	const adjtimex			:  (txc_p : timex# -> int64)
 	const times			:  (tbuf : tms# -> int64)
-	const gettid			:  ( -> int64)
+	const gettid			:  ( -> pid)
 	const alarm			:  (seconds : uint -> int64)
 	const getppid			:  ( -> int64)
 	const geteuid			:  ( -> int64)
@@ -1484,6 +1492,7 @@ pkg sys =
 	const mq_notify			:  (mqdes : int, notification : sigevent# -> int64)
 	const mq_getsetattr		:  (mqdes : int, mqstat : mq_attr#, omqstat : mq_attr# -> int64)
 	const prctl			:  (option : int, arg2 : uint64, arg3 : uint64, arg4 : uint64, arg5 : uint64 -> int64)
+	const arch_prctl		:  (op : arch_prctlop, addr : void# -> int64)
 	const swapon			:  (specialfile : byte#, swap_flags : int -> int64)
 	const swapoff			:  (specialfile : byte# -> int64)
 	const _sysctl			:  (args : sysctl_args# -> int64)
@@ -1782,7 +1791,7 @@ const times	= {tbuf
 	 -> (syscall(Systimes, a(tbuf)) : int64)
 }
 const gettid	= {
-	 -> (syscall(Sysgettid) : int64)
+	 -> (syscall(Sysgettid) : pid)
 }
 const alarm	= {seconds
 	 -> (syscall(Sysalarm, a(seconds)) : int64)
@@ -2309,6 +2318,9 @@ const mq_getsetattr	= {mqdes, mqstat, omqstat
 const prctl	= {option, arg2, arg3, arg4, arg5
 	 -> (syscall(Sysprctl, a(option), a(arg2), a(arg3), a(arg4), a(arg5)) : int64)
 }
+const arch_prctl	= {op, addr
+	 -> syscall(Sysarch_prctl, op, addr)
+}
 const swapon	= {specialfile, swap_flags
 	 -> (syscall(Sysswapon, a(specialfile), a(swap_flags)) : int64)
 }
diff --git a/lib/sys/sys+netbsd-x64.myr b/lib/sys/sys+netbsd-x64.myr
index 8e22c0b7..5fd3bb04 100644
--- a/lib/sys/sys+netbsd-x64.myr
+++ b/lib/sys/sys+netbsd-x64.myr
@@ -18,6 +18,7 @@ pkg sys =
 	type umtxop	= int32
 	type signo	= int32
 	type sigflags	= int32
+	type sysarchop	= int64
 
 	type clock = union
 		`Clockrealtime
@@ -344,6 +345,21 @@ pkg sys =
 	const Umtxmtxwake2	: umtxop = 22
 	const Umtxmax	: umtxop = 23
 
+	/* sysarch ops */
+	const X8664getldt    : sysarchop = 0
+	const X8664setldt    : sysarchop = 1
+	const X8664iopl      : sysarchop = 2
+	const X8664getioperm : sysarchop = 3
+	const X8664setioperm : sysarchop = 4
+	const X8664oldvm86   : sysarchop = 5
+	const X8664getmtrr   : sysarchop = 11
+	const X8664setmtrr   : sysarchop = 12
+	const X8664vm86      : sysarchop = 13
+	const X8664getgsbase : sysarchop = 14
+	const X8664getfsbase : sysarchop = 15
+	const X8664setgsbase : sysarchop = 16
+	const X8664setfsbase : sysarchop = 17
+
 	/* signal actions */
 	const Saonstack		: sigflags = 0x0001	/* take signal on signal stack */
 	const Sarestart		: sigflags = 0x0002	/* restart system call on signal return */
@@ -908,6 +924,9 @@ pkg sys =
 		new : void#, newsz : size# \
 		-> int)
 
+	/* misc */
+	const sysarch	: (op : sysarchop, args : void## -> int)
+
 	extern const cstring	: (str : byte[:] -> byte#)
 	/* filled by start code */
 	extern var __cenvp : byte##
@@ -1102,6 +1121,10 @@ const sysctl = {mib, old, oldsz, new, newsz
 		(mib : int#), a(mib.len), old, oldsz, new, newsz) : int)
 }
 
+const sysarch = {op, args
+	-> (syscall(Syssysarch, op, args) : int)
+}
+
 const clockid = {clk
 	match clk
 	| `Clockrealtime:	-> 0
diff --git a/lib/sys/sys+openbsd-x64.myr b/lib/sys/sys+openbsd-x64.myr
index a2601344..d840608f 100644
--- a/lib/sys/sys+openbsd-x64.myr
+++ b/lib/sys/sys+openbsd-x64.myr
@@ -215,7 +215,7 @@ pkg sys =
 	const Mfixed	: mopt = 0x10
 	const Mfile	: mopt = 0x0
 	const Manon	: mopt = 0x1000
-	const Mstack	: mopt = 0x4000
+	const Mstack	: mopt = 0x0
 	const Mnoreplace	: mopt = 0x0800
 
 	/* file types */
diff --git a/lib/sys/sys+openbsd:6.1-x64.myr b/lib/sys/sys+openbsd:6.1-x64.myr
index 93143945..9ac648a2 100644
--- a/lib/sys/sys+openbsd:6.1-x64.myr
+++ b/lib/sys/sys+openbsd:6.1-x64.myr
@@ -1031,7 +1031,7 @@ pkg sys =
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1750,5 +1750,5 @@ const __set_tcb	= {tcb
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
diff --git a/lib/sys/sys+openbsd:6.2-x64.myr b/lib/sys/sys+openbsd:6.2-x64.myr
index 598d3817..d89b7023 100644
--- a/lib/sys/sys+openbsd:6.2-x64.myr
+++ b/lib/sys/sys+openbsd:6.2-x64.myr
@@ -348,7 +348,7 @@ pkg sys =
 	const Mfixed	: mopt = 0x10
 	const Mfile	: mopt = 0x0
 	const Manon	: mopt = 0x1000
-	const Mstack	: mopt = 0x4000
+	const Mstack	: mopt = 0x0
 	const Mnoreplace	: mopt = 0x0800
 	
 	/* file types */
@@ -1037,7 +1037,7 @@ pkg sys =
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1759,5 +1759,5 @@ const __set_tcb	= {tcb
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
diff --git a/lib/sys/sys+openbsd:6.3-x64.myr b/lib/sys/sys+openbsd:6.3-x64.myr
index d013643e..ac7170b2 100644
--- a/lib/sys/sys+openbsd:6.3-x64.myr
+++ b/lib/sys/sys+openbsd:6.3-x64.myr
@@ -1036,7 +1036,7 @@ pkg sys =
 	const symlinkat			:  (path : byte#, fd : int, link : byte# -> int)
 	const unlinkat			:  (fd : int, path : byte#, flag : int -> int)
 	const __set_tcb			:  (tcb : void# -> void)
-	const __get_tcb			:  ( -> void)
+	const __get_tcb			:  ( -> void#)
 ;;
 
 	/* start manual overrides { */
@@ -1755,5 +1755,5 @@ const __set_tcb	= {tcb
 	 -> (syscall(Sys__set_tcb, a(tcb)) : void)
 }
 const __get_tcb	= {
-	 -> (syscall(Sys__get_tcb) : void)
+	 -> (syscall(Sys__get_tcb) : void#)
 }
diff --git a/lib/thread/bld.sub b/lib/thread/bld.sub
index c3800918..57aed13c 100644
--- a/lib/thread/bld.sub
+++ b/lib/thread/bld.sub
@@ -14,6 +14,11 @@ lib thread =
 	sem.myr
 	waitgrp.myr
 
+	# fsbase-based impls
+	tls+fsbase.myr
+	tls-impl+fsbase-x64.s
+	types+fsbase.myr
+
 	# futex-based impls
 	mutex+futex.myr
 	rwlock+futex.myr
@@ -23,6 +28,7 @@ lib thread =
 	# linux impl of basic thread primitives
 	condvar+linux.myr
 	exit+linux-x64.s
+	fsbase+linux.myr
 	futex+linux.myr
 	ncpu+linux.myr
 	spawn+linux.myr
@@ -30,6 +36,7 @@ lib thread =
 	# freebsd impl of thread primitives
 	condvar+freebsd.myr
 	exit+freebsd-x64.s
+	fsbase+freebsd.myr
 	futex+freebsd.myr
 	ncpu+freebsd.myr
 	spawn+freebsd.myr
@@ -37,6 +44,7 @@ lib thread =
 	# netbsd impl of thread primitives
 	#condvar+netbsd.myr
 	#mutex+netbsd.myr
+	fsbase+netbsd.myr
 	spawn+netbsd.myr
 	#ncpu+netbsd.myr
 	#exit+netbsd-x64.s
@@ -46,6 +54,9 @@ lib thread =
 	futex+osx.myr
 	spawn+osx.myr
 	start+osx-x64.s
+	tls+osx.myr
+	tls-impl+osx-x64.s
+	types+osx.myr
 
 	# 9front impl of thread primitives
 	#condvar+plan9.myr
@@ -58,6 +69,7 @@ lib thread =
 	# openbsd impl of thread primitives
 	condvar+openbsd:6.2.myr
 	exit+openbsd-x64.s
+	fsbase+openbsd.myr
 	futex+openbsd:6.2.myr
 	ncpu+openbsd.myr
 	spawn+openbsd.myr
diff --git a/lib/thread/common.myr b/lib/thread/common.myr
index 66fc9d56..3e4f1f5c 100644
--- a/lib/thread/common.myr
+++ b/lib/thread/common.myr
@@ -1,5 +1,3 @@
-use std
-
-pkg thread = 
+pkg thread =
 	pkglocal generic Zptr : @a#  = (0 : @a#)
 ;;
diff --git a/lib/thread/exit+freebsd-x64.s b/lib/thread/exit+freebsd-x64.s
index d8952b6d..5279bf1c 100644
--- a/lib/thread/exit+freebsd-x64.s
+++ b/lib/thread/exit+freebsd-x64.s
@@ -1,19 +1,12 @@
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$73,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	subq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* thr_exit(null) */
diff --git a/lib/thread/exit+linux-x64.s b/lib/thread/exit+linux-x64.s
index a54e8026..8e438522 100644
--- a/lib/thread/exit+linux-x64.s
+++ b/lib/thread/exit+linux-x64.s
@@ -1,19 +1,12 @@
 /*
-const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
+const thread.exit : (-> void)
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$11,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	subq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* thread_exit(0) */
diff --git a/lib/thread/exit+openbsd-x64.s b/lib/thread/exit+openbsd-x64.s
index 6421cc3a..84eb0c72 100644
--- a/lib/thread/exit+openbsd-x64.s
+++ b/lib/thread/exit+openbsd-x64.s
@@ -1,15 +1,8 @@
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl thread$exit
 thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* 
 	  Because OpenBSD wants a valid stack whenever
 	  we enter the kernel, we need to toss a preallocated
@@ -19,8 +12,8 @@ thread$exit:
 
 	/* munmap(base, size) */
 	movq	$73,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	subq	%fs:0x08,%rdi	/* base */
+	movq	%fs:0x10,%rsi	/* stksz */
 	syscall
 
 	/* __threxit(0) */
diff --git a/lib/thread/fsbase+freebsd.myr b/lib/thread/fsbase+freebsd.myr
new file mode 100644
index 00000000..e648aba0
--- /dev/null
+++ b/lib/thread/fsbase+freebsd.myr
@@ -0,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	match sys.sysarch(sys.Archamd64setfs, &(h : void#))
+	| 0:
+	| err:
+		std.fput(std.Err, "error: sysarch returned {}\n", err)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h
+	match sys.sysarch(sys.Archamd64getfs, &h)
+	| 0: -> (h : tlshdr#)
+	| err:
+		std.fput(std.Err, "error: sysarch returned {}\n", err)
+		std.suicide()
+	;;
+}
diff --git a/lib/thread/fsbase+linux.myr b/lib/thread/fsbase+linux.myr
new file mode 100644
index 00000000..1641a649
--- /dev/null
+++ b/lib/thread/fsbase+linux.myr
@@ -0,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	match sys.arch_prctl(sys.Archsetfs, (h : void#))
+	| 0:
+	| err:
+		std.fput(std.Err, "error: arch_prctl returned {}\n", err)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h : tlshdr#
+	match sys.arch_prctl(sys.Archgetfs, (&h : void#))
+	| 0: -> h
+	| err:
+		std.fput(std.Err, "error: arch_prctl returned {}\n", err)
+		std.suicide()
+	;;
+}
diff --git a/lib/thread/fsbase+netbsd.myr b/lib/thread/fsbase+netbsd.myr
new file mode 100644
index 00000000..3c470cf1
--- /dev/null
+++ b/lib/thread/fsbase+netbsd.myr
@@ -0,0 +1,28 @@
+use std
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	match sys.sysarch(sys.X8664setfsbase, &(h : void#))
+	| 0:
+	| err:
+		std.fput(std.Err, "error: sysarch returned: {}\n", err)
+		std.suicide()
+	;;
+}
+
+const getfsbase = {
+	var h
+	match sys.sysarch(sys.X8664getfsbase, &h)
+	| 0: -> (h : tlshdr#)
+	| err:
+		std.fput(std.Err, "error: sysarch returned: {}\n", err)
+		std.suicide()
+	;;
+}
diff --git a/lib/thread/fsbase+openbsd.myr b/lib/thread/fsbase+openbsd.myr
new file mode 100644
index 00000000..7ae92b27
--- /dev/null
+++ b/lib/thread/fsbase+openbsd.myr
@@ -0,0 +1,16 @@
+use sys
+
+use "types"
+
+pkg thread =
+	pkglocal const setfsbase : (h : tlshdr# -> void)
+	pkglocal const getfsbase : (-> tlshdr#)
+;;
+
+const setfsbase = {h
+	sys.__set_tcb((h : void#))
+}
+
+const getfsbase = {
+	-> (sys.__get_tcb() : tlshdr#)
+}
diff --git a/lib/thread/mutex+futex.myr b/lib/thread/mutex+futex.myr
index 50e8406d..bb9012d8 100644
--- a/lib/thread/mutex+futex.myr
+++ b/lib/thread/mutex+futex.myr
@@ -1,9 +1,14 @@
+use std
+
 use "atomic"
 use "futex"
+use "tls"
+use "types"
 
 pkg thread =
 	type mutex = struct
 		_state	: ftxtag
+		_owner	: tid
 	;;	
 
 	const mkmtx	: (-> mutex)
@@ -21,12 +26,19 @@ const Contended = 2
 var nspin = 10	/* FIXME: pick a sane number, based on CPU count */
 
 const mkmtx = {
-	-> [._state = Unlocked]
+	-> [._state = Unlocked, ._owner = -1]
 }
 
 const mtxlock = {mtx
 	var c
 
+	if mtx._owner == tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to relock a mutex it already holds\n",
+			tid())
+		std.suicide()
+	;;
+
 	/*
 	Uncontended case: we get an unlocked mutex, and we lock it.
 	*/
@@ -34,6 +46,7 @@ const mtxlock = {mtx
 	for var i = 0; i < nspin; i++
 		c = xcas(&mtx._state, Unlocked, Locked)
 		if c == Unlocked
+			mtx._owner = tid()
 			-> void
 		;;
 	;;
@@ -51,13 +64,31 @@ const mtxlock = {mtx
 		ftxwait(&mtx._state, Contended, -1)
 		c = xchg(&mtx._state, Contended)
 	;;
+	mtx._owner = tid()
 }
 
 const mtxtrylock = {mtx
-	-> xcas(&mtx._state, Unlocked, Locked) == Unlocked
+	if xcas(&mtx._state, Unlocked, Locked) == Unlocked
+		mtx._owner = tid()
+		-> true
+	;;
+	-> false
 }
 
 const mtxunlock = {mtx
+	/*
+	Nonatomically loading mtx._owner may produce false negatives on
+	weakly-ordered architectures but having to atomically store and load
+	mtx._owner doesn't seem worth it.
+	*/
+	if mtx._owner != tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to unlock a mutex last held by {}\n",
+			tid(), mtx._owner)
+		std.suicide()
+	;;
+	mtx._owner = -1
+
 	/*
 	Either the lock is contended or it's uncontended. Any other
 	state is a bug.
@@ -72,7 +103,15 @@ const mtxunlock = {mtx
 }
 
 const mtxcontended = {mtx
+	if mtx._owner == tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to relock a mutex it already holds\n",
+			tid())
+		std.suicide()
+	;;
+
 	while xchg(&mtx._state, Contended) != Unlocked
 		ftxwait(&mtx._state, Contended, -1)
 	;;
+	mtx._owner = tid()
 }
diff --git a/lib/thread/mutex.myr b/lib/thread/mutex.myr
index b37f2fb3..100ab450 100644
--- a/lib/thread/mutex.myr
+++ b/lib/thread/mutex.myr
@@ -1,5 +1,4 @@
 use std
-use sys
 
 use "atomic"
 
diff --git a/lib/thread/rwlock+futex.myr b/lib/thread/rwlock+futex.myr
index 4975c953..a1174405 100644
--- a/lib/thread/rwlock+futex.myr
+++ b/lib/thread/rwlock+futex.myr
@@ -2,6 +2,8 @@ use std
 
 use "atomic"
 use "futex"
+use "tls"
+use "types"
 
 pkg thread =
 	/*
@@ -13,6 +15,7 @@ pkg thread =
 	*/
 	type rwlock = struct
 		_state : ftxtag
+		_owner : tid
 	;;
 
 	const mkrwlock  : (-> rwlock)
@@ -28,7 +31,7 @@ const Nrmask  = 0x7fffffff
 const Waitbit = 0x80000000
 
 const mkrwlock = {
-	-> [._state = 0]
+	-> [._state = 0, ._owner = -1]
 }
 
 const rdlock = {rw
@@ -61,6 +64,13 @@ const rdlock = {rw
 
 const wrlock = {rw
 	for ; ;
+		if rw._owner == tid()
+			std.fput(std.Err,
+				"error: thread {} attempted to relock an rwlock it already holds\n",
+				tid())
+			std.suicide()
+		;;
+
 		/*
 		`_state` must be 0 for a writer to acquire the lock. Anything
 		else means the lock is either held or in the process of being
@@ -68,6 +78,7 @@ const wrlock = {rw
 		 */
 		var s = xcas(&rw._state, 0, Nrmask)
 		if s == 0
+			rw._owner = tid()
 			-> void
 		;;
 
@@ -98,7 +109,11 @@ const tryrdlock = {rw
 }
 
 const trywrlock = {rw
-	-> xcas(&rw._state, 0, Nrmask) == 0
+	if xcas(&rw._state, 0, Nrmask) == 0
+		rw._owner = tid()
+		-> true
+	;;
+	-> false
 }
 
 const rdunlock = {rw
@@ -122,6 +137,14 @@ const rdunlock = {rw
 }
 
 const wrunlock = {rw
+	if rw._owner != tid()
+		std.fput(std.Err,
+			"error: thread {} attempted to unlock an rwlock last held by {}\n",
+			tid(), rw._owner)
+		std.suicide()
+	;;
+	rw._owner = -1
+
 	/*
 	If the wait bit was set then there are one or more waiting readers,
 	writers, or both. In the first and third cases, we need to wake
diff --git a/lib/thread/spawn+freebsd.myr b/lib/thread/spawn+freebsd.myr
index cdc76732..66a28bdc 100644
--- a/lib/thread/spawn+freebsd.myr
+++ b/lib/thread/spawn+freebsd.myr
@@ -1,9 +1,12 @@
 use sys
 use std
 
-pkg thread =
-	type tid = uint64
+use "common"
+use "fsbase"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
@@ -16,60 +19,63 @@ const spawn = {fn
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ctid, ret
-	var szp, f, tos, env, envsz
+	var stk, tos, stksz, hdr, tid = -1, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	tid = -1
-	/* find top of stack */
-	tos = (stk : std.intptr) + (sz : std.intptr)
-
-	/* store the stack size */
-	tos -= sizeof(sys.size)
-	sz -= sizeof(sys.size)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store the function we call */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	sz -= (envsz : sys.size)
-	env = tos
-	tos -= sizeof((->void))
-	sz -= sizeof((->void))
-	f = (tos : (->void)#)
-	f# = std.fnbdup(fn, (env : byte#)[:envsz])
-	var repr = (&fn : int64[2]#)#
+	(tos, stksz, hdr) = initstk(stk, fn, sz)
 
 	ret = sys.thr_new(&[
 		.startfn = (startthread : void#),
 		.arg = (tos : void#),
 		.stkbase = (stk : byte#),
-		.stksz = sz,
-		.tid = &ctid,
+		.stksz = stksz,
+		.tid = (&hdr.tid : uint64#),
 		.ptid = &tid,
 		.flags = 2,
-		.rtp = (0 : sys.rtprio#)
+		.rtp = Zptr,
 	], sizeof(sys.thrparam))
 
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (tid : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var stksz, len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	m = (p : std.intptr)
-	-> (m : byte#)
+	stksz = sz
+	len = tlslen()
+	stksz -= (sizeof(tlshdr) + ((len : sys.size) * sizeof(void#)) + 0xf) & ~0xf
+	tos = (stk : std.intptr) + (stksz : std.intptr)
+	hdr = (tos : tlshdr#)
+	hdr.base = stk
+	hdr.stksz = sz
+
+	var fn1 = {
+		/*
+		We write `hdr.len` here because it follows `hdr.tid` so it gets
+		overwritten by the kernel in `thr_new`. Even though `sys.pid`
+		is 32 bits, `thr_param.tid` is a `uint64#` for legacy reasons.
+		*/
+		hdr.len = len
+		setfsbase(hdr)
+		fn()
+	}
+
+	envsz = std.fnenvsz(fn1)
+	tos -= (envsz : std.intptr)
+	stksz -= (envsz : sys.size)
+	env = tos
+	tos -= sizeof((->void))
+	stksz -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn1, (env : byte#)[:envsz])
+	-> ((tos : byte#), stksz, hdr)
 }
 
 const startthread = {f : (-> void)#
diff --git a/lib/thread/spawn+linux.myr b/lib/thread/spawn+linux.myr
index a56317f7..d56ae478 100644
--- a/lib/thread/spawn+linux.myr
+++ b/lib/thread/spawn+linux.myr
@@ -1,72 +1,67 @@
 use sys
 use std
 
-pkg thread =
-	type tid = sys.pid
+use "common"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
+const Stacksz = 8*std.MiB
 extern const exit : (-> void)
 
 /* Holy shit flag mania. */
-const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles  | \
-	sys.Clonesighand | sys.Clonethread |sys.Clonesysvsem | \
-	sys.Clonesettls | sys.Cloneparentsettid | sys.Clonechildcleartid
-
-const Stacksz = 8*std.MiB
+const Thrflag = sys.Clonevm | sys.Clonefs | sys.Clonefiles | \
+	sys.Clonesighand | sys.Clonethread | sys.Clonesettls | \
+	sys.Clonechildsettid
 
 const spawn = {fn
 	-> spawnstk(fn, Stacksz)
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ctid, ret
+	var stk, tos, hdr, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	stk = initstack(stk, fn, Stacksz)
+	(tos, hdr) = initstk(stk, fn, sz)
 
-	ret = sys.fnclone(Thrflag, \
-		(stk : byte#),\
-		&tid, (0 : byte#), \
-		&ctid, (0 : byte#), \
+	ret = sys.fnclone(Thrflag,
+		tos,
+		Zptr,
+		(hdr : byte#),
+		(&hdr.tid : sys.pid#),
+		Zptr,
 		(startthread : void#))
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (ret : tid)
 }
 
-const initstack = {stk, fn, sz
-	var tos, szp, fp, env, envsz
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
+
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
 
 	envsz = std.fnenvsz(fn)
-	tos = (stk : std.intptr)
-	tos -= sizeof(int64)
-	szp = (tos : sys.size#)
-	szp# = sz
 	tos -= (envsz : std.intptr)
 	env = tos
 	tos -= sizeof((->void))
 	fp = (tos : (->void)#)
 	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
-	-> (tos : byte#)
-}
-
-const getstk = {sz
-	var p, m
-
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	/* stack starts at the top of memory and grows down. */
-	m = (p : std.intptr)
-	m += (sz : std.intptr)
-	-> (m : byte#)
+	-> ((tos : byte#), hdr)
 }
 
 const startthread = {fn : (-> void)
diff --git a/lib/thread/spawn+openbsd.myr b/lib/thread/spawn+openbsd.myr
index 4526520a..c63b51b8 100644
--- a/lib/thread/spawn+openbsd.myr
+++ b/lib/thread/spawn+openbsd.myr
@@ -1,9 +1,11 @@
 use std
 use sys
 
-pkg thread =
-	type tid = uint64
+use "common"
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 	pkglocal var exitstk : byte#
 ;;
@@ -18,6 +20,7 @@ const __init__ = {
 	  time to swap to before we invalidate a stack.
 	 */
 	exitstk = getstk(16)
+	std.assert(exitstk != sys.Mapbad, "error: failed to mmap exitstk\n")
 }
 
 const spawn = {fn;
@@ -25,30 +28,17 @@ const spawn = {fn;
 }
 
 const spawnstk = {fn, sz
-	var stk, szp, fp, tos, tfp, env, envsz
-	var ret
+	var stk, tos, hdr, tfp, ret
 
 	stk = getstk(sz)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	/* store size */
-	tos = (stk : std.intptr)
-	tos -= sizeof(int64)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store func */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	env = tos
-	tos -= sizeof((->void))
-	fp = (tos : (->void)#)
-	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
+	(tos, hdr) = initstk(stk, fn, sz)
 
 	tfp = [
-		.tcb = (0 : void#),
-		.tid = &ret,
+		.tcb = (hdr : void#),
+		.tid = (&hdr.tid : sys.pid#),
 		.stk = (tos : byte#),
 	]
 	ret = sys.__tfork_thread(&tfp,
@@ -56,22 +46,34 @@ const spawnstk = {fn, sz
 		(startthread : void#),
 		(0 : void#))
 	if ret < 0
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
 	-> `std.Ok (ret : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon | sys.Mstack, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	/* stack starts at the top of memory and grows down. */
-	m = (p : std.intptr)
-	m += (sz : std.intptr)
-	-> (m : byte#)
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
+	envsz = std.fnenvsz(fn)
+	tos -= (envsz : std.intptr)
+	env = tos
+	tos -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn, (env : byte#)[:envsz])
+	-> ((tos : byte#), hdr)
+}
+
+const getstk = {sz
+	-> sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 }
 
 const startthread = {fn : (-> void)
diff --git a/lib/thread/spawn+osx.myr b/lib/thread/spawn+osx.myr
index 417e64aa..3e6ed161 100644
--- a/lib/thread/spawn+osx.myr
+++ b/lib/thread/spawn+osx.myr
@@ -1,9 +1,10 @@
 use sys
 use std
 
-pkg thread =
-	type tid = uint64
+use "tls"
+use "types"
 
+pkg thread =
 	const spawn : (fn : (-> void) -> std.result(tid, byte[:]))
 ;;
 
@@ -34,34 +35,13 @@ const spawn = {fn
 }
 
 const spawnstk = {fn, sz
-	var stk : byte#, tid, ret
-	var szp, f, tos, env, envsz
+	var stk, tos, ret
 
-	stk = getstk(sz)
+	stk = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
 	if stk == sys.Mapbad
 		-> `std.Err "couldn't get stack"
 	;;
-	tid = -1
-
-	/* find top of stack */
-	tos = (stk : std.intptr) + (sz : std.intptr)
-
-	/* store the stack size */
-	tos -= sizeof(sys.size)
-	sz -= sizeof(sys.size)
-	szp = (tos : sys.size#)
-	szp# = Stacksz
-
-	/* store the function we call */
-	envsz = std.fnenvsz(fn)
-	tos -= (envsz : std.intptr)
-	sz -= (envsz : sys.size)
-	env = tos
-	tos -= sizeof((->void))
-	sz -= sizeof((->void))
-	f = (tos : (->void)#)
-	f# = std.fnbdup(fn, (env : byte#)[:envsz])
-	var repr = (&fn : int64[2]#)#
+	tos = initstk(stk, fn, sz)
 
 	ret = sys.bsdthread_create( \
 		(tramp	: void#), \	/* start */
@@ -70,21 +50,37 @@ const spawnstk = {fn, sz
 		(0	: void#), \		/* pthread struct */
 		0x01000000)			/* flags (PTHREAD_START_CUSTOM): don't alloc stack in kernel */
 
-	if ret == (-1 : void#)
+	if (ret : int64) < 0	/* signed compare: an unsigned std.size is never < 0 */
+		sys.munmap(stk, sz)
 		-> `std.Err "couldn't spawn thread"
 	;;
-	-> `std.Ok (ret : tid)
+	-> `std.Ok (stk : tid)
 }
 
-const getstk = {sz
-	var p, m
+const initstk = {stk, fn, sz
+	var len, tos, hdr, fp, env, envsz
 
-	p = sys.mmap((0 : byte#), sz, sys.Mprotrw, sys.Mpriv | sys.Manon, -1, 0)
-	if p == sys.Mapbad
-		-> p
-	;;
-	m = (p : std.intptr)
-	-> (m : byte#)
+	len = tlslen()
+	tos = (stk : std.intptr) + (sz : std.intptr)
+	tos -= (sizeof(tlshdr) + ((len : std.intptr) * sizeof(void#)) + 0xf) & ~0xf
+	hdr = (tos : tlshdr#)
+	hdr.tid = (stk : tid)
+	hdr.len = len
+	hdr.base = stk
+	hdr.stksz = sz
+
+	var fn1 = {
+		setgsbase(hdr)
+		fn()
+	}
+
+	envsz = std.fnenvsz(fn1)
+	tos -= (envsz : std.intptr)
+	env = tos
+	tos -= sizeof((->void))
+	fp = (tos : (->void)#)
+	fp# = std.fnbdup(fn1, (env : byte#)[:envsz])
+	-> (tos : byte#)
 }
 
 /*
diff --git a/lib/thread/start+osx-x64.s b/lib/thread/start+osx-x64.s
index bb497bb8..75feb07a 100644
--- a/lib/thread/start+osx-x64.s
+++ b/lib/thread/start+osx-x64.s
@@ -15,20 +15,13 @@ _thread$start:
 	
 /*
 const thread.exit	: (stacksz : std.size -> void)
-NOTE: must be called from the bottom of the stack, since
-we assume that %rbp is in the top 4k of the stack.
 */
 .globl _thread$exit
 _thread$exit:
-	/* find top of stack */
-	movq	%rbp,%rdi	/* addr */
-	andq	$~0xfff,%rdi	/* align it */
-	addq	$0x1000,%rdi
-
 	/* munmap(base, size) */
 	movq	$0x2000049,%rax	/* munmap */
-	movq	-8(%rdi),%rsi	/* size */
-	subq	%rsi,%rdi	/* move to base ptr */
+	movq	%gs:0x10,%rdi	/* tlshdr.base (osx: tid and len are 8 bytes each) */
+	movq	%gs:0x18,%rsi	/* tlshdr.stksz */
 	syscall
 
 	/* exit the thread */
diff --git a/lib/thread/test/die.myr b/lib/thread/test/die.myr
new file mode 100644
index 00000000..db0fb219
--- /dev/null
+++ b/lib/thread/test/die.myr
@@ -0,0 +1,8 @@
+use thread
+
+const main = {
+	var m = thread.mkmtx()
+	thread.mtxlock(&m)
+	thread.mtxunlock(&m)
+	thread.mtxunlock(&m)
+}
diff --git a/lib/thread/test/tls.myr b/lib/thread/test/tls.myr
new file mode 100644
index 00000000..4f8ee9e2
--- /dev/null
+++ b/lib/thread/test/tls.myr
@@ -0,0 +1,35 @@
+use std
+use sys
+use thread
+
+var start
+var end
+var wg
+
+const setget = {
+	var tid = thread.tid()
+	for var i = start; i <= end; i++
+		thread.tlsset(i, ((tid : thread.key) + i : void#))
+	;;
+	for var i = start; i <= end; i++
+		std.assert(thread.tlsget(i) == ((tid : thread.key) + i : void#),
+			"tls is broken\n")
+	;;
+	thread.wgpost(&wg)
+}
+
+const main = {
+	start = thread.gettlskey()
+	for var i = 0; i < 100; i++
+		thread.gettlskey()
+	;;
+	end = thread.gettlskey()
+
+	wg = thread.mkwg(101)
+	for var i = 0; i < 100; i++
+		thread.spawn(setget)
+	;;
+	setget()
+
+	thread.wgwait(&wg)
+}
diff --git a/lib/thread/tls+fsbase.myr b/lib/thread/tls+fsbase.myr
new file mode 100644
index 00000000..481ec883
--- /dev/null
+++ b/lib/thread/tls+fsbase.myr
@@ -0,0 +1,59 @@
+use std
+
+use "common"
+use "fsbase"
+use "types"
+
+pkg thread =
+	const        gettlskey : (-> key)
+	generic      tlsset    : (k : key, v : @a# -> void)
+	generic      tlsget    : (k : key -> @a#)
+	extern const tid       : (-> tid)
+
+	pkglocal const        tlsoob : (k : key -> void)
+	pkglocal extern const tlslen : (-> key)
+;;
+
+const Staticcap = 8
+
+var _hdr
+var _cap = Staticcap
+
+const gettlskey = {
+	std.assert(tid() == 0, "error: gettlskey must be called from main thread\n")
+	if _hdr == Zptr
+		/* `_hdr` is lazily initialized here since we can't set it in start.s */
+		_hdr = getfsbase()
+	;;
+
+	if _hdr.len++ == _cap
+		std.assert(_cap < 0x8000_0000, "error: max tls slots exceeded\n")
+		var l = sizeof(tlshdr) + ((_cap : std.size) * sizeof(void#))
+		var h = std.bytealloc(sizeof(tlshdr) + ((_cap *= 2 : std.size) * sizeof(void#)))
+
+		std.memblit(h, (_hdr : byte#), l)
+		setfsbase((h : tlshdr#))
+		/* this is ugly... the initial tls region is statically allocated */
+		if _cap != Staticcap * 2
+			std.bytefree((_hdr : byte#), l)
+		;;
+		_hdr = (h : tlshdr#)
+	;;
+	-> _hdr.len - 1
+}
+
+generic tlsset = {k, v
+	_tlsset(k, (v : void#))
+}
+
+generic tlsget = {k
+	-> (_tlsget(k) : @a#)
+}
+
+const tlsoob = {k
+	std.fput(std.Err, "error: tls key {} out of bounds {}\n", k, tlslen())
+	std.suicide()
+}
+
+extern const _tlsset : (k : key, v : void# -> void)
+extern const _tlsget : (k : key -> void#)
diff --git a/lib/thread/tls+osx.myr b/lib/thread/tls+osx.myr
new file mode 100644
index 00000000..cf958c01
--- /dev/null
+++ b/lib/thread/tls+osx.myr
@@ -0,0 +1,70 @@
+use std
+
+use "common"
+use "types"
+
+pkg thread =
+	const        gettlskey : (-> key)
+	generic      tlsset    : (k : key, v : @a# -> void)
+	generic      tlsget    : (k : key -> @a#)
+	extern const tid       : (-> tid)
+
+	pkglocal const        tlsoob    : (k : key -> void)
+	pkglocal extern const tlslen    : (-> key)
+	pkglocal const        setgsbase : (h : tlshdr# -> void)
+	pkglocal extern const getgsbase : (-> tlshdr#)
+;;
+
+const Staticcap = 8
+
+var _hdr
+var _cap = Staticcap
+
+const gettlskey = {
+	std.assert(tid() == 0, "error: gettlskey must be called from main thread\n")
+	if _hdr == Zptr
+		/* `_hdr` is lazily initialized here since we can't set it in start.s */
+		_hdr = getgsbase()
+	;;
+
+	if _hdr.len++ == _cap
+		std.assert(_cap < 0x8000_0000, "error: max tls slots exceeded\n")
+		var l = sizeof(tlshdr) + ((_cap : std.size) * sizeof(void#))
+		var h = std.bytealloc(sizeof(tlshdr) + ((_cap *= 2 : std.size) * sizeof(void#)))
+
+		std.memblit(h, (_hdr : byte#), l)
+		setgsbase((h : tlshdr#))
+		/* this is ugly... the initial tls region is statically allocated */
+		if _cap != Staticcap * 2
+			std.bytefree((_hdr : byte#), l)
+		;;
+		_hdr = (h : tlshdr#)
+	;;
+	-> _hdr.len - 1
+}
+
+generic tlsset = {k, v
+	_tlsset(k, (v : void#))
+}
+
+generic tlsget = {k
+	-> (_tlsget(k) : @a#)
+}
+
+const tlsoob = {k
+	std.fput(std.Err, "error: tls key {} out of bounds {}\n", k, tlslen())
+	std.suicide()
+}
+
+const setgsbase = {h
+	match _setgsbase(h)
+	| 0xf: /* yes, this indicates success; no, it's not documented */
+	| err:
+		std.fput(std.Err, "error: setgsbase returned {}\n", err)
+		std.suicide()
+	;;
+}
+
+extern const _tlsset    : (k : key, v : void# -> void)
+extern const _tlsget    : (k : key -> void#)
+extern const _setgsbase : (h : tlshdr# -> int64)
diff --git a/lib/thread/tls-impl+fsbase-x64.s b/lib/thread/tls-impl+fsbase-x64.s
new file mode 100644
index 00000000..d34e4219
--- /dev/null
+++ b/lib/thread/tls-impl+fsbase-x64.s
@@ -0,0 +1,48 @@
+.set tid,	0x00
+.set len,	0x04
+.set slots,	0x18
+
+/* const tid : (-> tid) */
+.globl thread$tid
+.globl _thread$tid
+thread$tid:
+_thread$tid:
+	movl	%fs:tid, %eax
+	ret
+
+/* const _tlsset : (k : key, v : void# -> void) */
+.globl thread$_tlsset
+.globl _thread$_tlsset
+thread$_tlsset:
+_thread$_tlsset:
+	cmpl	%fs:len, %edi
+	jnb	oob
+
+	movslq	%edi, %rdi
+	movq	$slots, %r10
+	movq	%rsi, %fs:(%r10, %rdi, 0x8)
+	ret
+
+/* const _tlsget : (k : key -> void#) */
+.globl thread$_tlsget
+.globl _thread$_tlsget
+thread$_tlsget:
+_thread$_tlsget:
+	cmpl	%fs:len, %edi
+	jnb	oob
+
+	movslq	%edi, %rdi
+	movq	$slots, %r10
+	movq	%fs:(%r10, %rdi, 0x8), %rax
+	ret
+
+oob:
+	call	thread$tlsoob
+
+/* const tlslen : (-> key) */
+.globl thread$tlslen
+.globl _thread$tlslen
+thread$tlslen:
+_thread$tlslen:
+	movl	%fs:len, %eax
+	ret
diff --git a/lib/thread/tls-impl+osx-x64.s b/lib/thread/tls-impl+osx-x64.s
new file mode 100644
index 00000000..bbe7dcdd
--- /dev/null
+++ b/lib/thread/tls-impl+osx-x64.s
@@ -0,0 +1,64 @@
+.set tid,	0x00
+.set len,	0x08
+.set self,	0x20
+.set slots,	0x28
+
+/* const tid : (-> tid) */
+.globl thread$tid
+.globl _thread$tid
+thread$tid:
+_thread$tid:
+	movq	%gs:tid, %rax
+	ret
+
+/* const _tlsset : (k : key, v : void# -> void) */
+.globl thread$_tlsset
+.globl _thread$_tlsset
+thread$_tlsset:
+_thread$_tlsset:
+	cmpq	%gs:len, %rdi
+	jnb	oob
+
+	movq	$slots, %r10
+	movq	%rsi, %gs:(%r10, %rdi, 0x8)
+	ret
+
+/* const _tlsget : (k : key -> void#) */
+.globl thread$_tlsget
+.globl _thread$_tlsget
+thread$_tlsget:
+_thread$_tlsget:
+	cmpq	%gs:len, %rdi
+	jnb	oob
+
+	movq	$slots, %r10
+	movq	%gs:(%r10, %rdi, 0x8), %rax
+	ret
+
+oob:
+	call	_thread$tlsoob
+
+/* const tlslen : (-> key) */
+.globl thread$tlslen
+.globl _thread$tlslen
+thread$tlslen:
+_thread$tlslen:
+	movq	%gs:len, %rax
+	ret
+
+/* const _setgsbase : (h : tlshdr# -> int64) */
+.globl thread$_setgsbase
+.globl _thread$_setgsbase
+thread$_setgsbase:
+_thread$_setgsbase:
+	movq	$0x3000003, %rax /* undocumented syscall; sets %gs to %rdi */
+	syscall
+	ret
+
+/* const getgsbase : (-> tlshdr#) */
+.globl thread$getgsbase
+.globl _thread$getgsbase
+thread$getgsbase:
+_thread$getgsbase:
+	movq	%gs:self, %rax
+	ret
diff --git a/lib/thread/types+fsbase.myr b/lib/thread/types+fsbase.myr
new file mode 100644
index 00000000..50228cd7
--- /dev/null
+++ b/lib/thread/types+fsbase.myr
@@ -0,0 +1,19 @@
+use sys
+
+pkg thread =
+	type tid = sys.pid /* 32 bits on all of the fsbase platforms */
+	type key = uint32
+
+	/*
+	XXX: Be sure to update tls-impl+fsbase.s and
+	rt/start-{freebsd,linux,netbsd,openbsd}.s if any changes are made to
+	the size of this struct and/or the offsets of any of its members.
+	 */
+	pkglocal type tlshdr = struct
+		tid   : tid
+		len   : key
+		base  : byte#
+		stksz : sys.size
+		slots : void#[...]
+	;;
+;;
diff --git a/lib/thread/types+osx.myr b/lib/thread/types+osx.myr
new file mode 100644
index 00000000..22c9c3e2
--- /dev/null
+++ b/lib/thread/types+osx.myr
@@ -0,0 +1,20 @@
+use sys
+
+pkg thread =
+	type tid = sys.pid /* 64 bits */
+	type key = uint64
+
+	/*
+	XXX: Be sure to update tls-impl+osx.s and rt/start-osx.s if any changes
+	are made to the size of this struct and/or the offsets of any of its
+	members.
+	 */
+	pkglocal type tlshdr = struct
+		tid   : tid
+		len   : key
+		base  : byte#
+		stksz : sys.size
+		self  : tlshdr#
+		slots : void#[...]
+	;;
+;;
diff --git a/mk/bootstrap/bootstrap+Darwin-x86_64.sh b/mk/bootstrap/bootstrap+Darwin-x86_64.sh
index 9e639571..3bc1f367 100755
--- a/mk/bootstrap/bootstrap+Darwin-x86_64.sh
+++ b/mk/bootstrap/bootstrap+Darwin-x86_64.sh
@@ -7,6 +7,7 @@ set -x
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/start.o lib/thread/start+osx-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+osx-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,18 @@ set -x
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+osx.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+osx.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+osx.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/start.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/start.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
diff --git a/mk/bootstrap/bootstrap+FreeBSD-amd64.sh b/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
index b5c62b13..dd835cdb 100755
--- a/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+FreeBSD-amd64.sh
@@ -7,6 +7,7 @@ set -x
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+freebsd-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@ set -x
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+freebsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+freebsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+freebsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+freebsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
diff --git a/mk/bootstrap/bootstrap+Linux-x86_64.sh b/mk/bootstrap/bootstrap+Linux-x86_64.sh
index 0b5b7619..c3282de6 100755
--- a/mk/bootstrap/bootstrap+Linux-x86_64.sh
+++ b/mk/bootstrap/bootstrap+Linux-x86_64.sh
@@ -7,6 +7,7 @@ set -x
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+linux-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@ set -x
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+linux.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+linux.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+linux.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+linux.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
diff --git a/mk/bootstrap/bootstrap+NetBSD-amd64.sh b/mk/bootstrap/bootstrap+NetBSD-amd64.sh
index ae560c05..a81f8c3b 100755
--- a/mk/bootstrap/bootstrap+NetBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+NetBSD-amd64.sh
@@ -6,6 +6,7 @@ set -x
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/config.myr
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -119,15 +120,18 @@ set -x
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+netbsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+netbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+netbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
diff --git a/mk/bootstrap/bootstrap+OpenBSD-amd64.sh b/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
index 65c45ccd..ad097852 100755
--- a/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
+++ b/mk/bootstrap/bootstrap+OpenBSD-amd64.sh
@@ -7,6 +7,7 @@ set -x
 	as -g -o mbld/cpufeatures.o mbld/cpufeatures+posixy-x64.s
 	as -g -o lib/thread/exit.o lib/thread/exit+openbsd-x64.s
 	as -g -o lib/thread/atomic-impl.o lib/thread/atomic-impl+x64.s
+	as -g -o lib/thread/tls-impl.o lib/thread/tls-impl+fsbase-x64.s
 	as -g -o lib/std/getbp.o lib/std/getbp+posixy-x64.s
 	$pwd/6/6m -I lib/sys lib/std/option.myr
 	$pwd/6/6m -I lib/sys lib/std/traits.myr
@@ -120,16 +121,19 @@ set -x
 	$pwd/6/6m -I lib/std -I lib/sys lib/bio/puti.myr
 	ar -rcs lib/bio/libbio.a lib/bio/puti.o lib/bio/geti.o lib/bio/fd.o lib/bio/mem.o lib/bio/bio.o lib/bio/types.o lib/bio/iter.o
 	$pwd/muse/muse -o lib/bio/libbio.use -p bio lib/bio/puti.use lib/bio/geti.use lib/bio/fd.use lib/bio/mem.use lib/bio/bio.use lib/bio/types.use lib/bio/iter.use
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+openbsd.myr
-	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+openbsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/common.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/atomic.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/types+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/fsbase+openbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/tls+fsbase.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/spawn+openbsd.myr
+	$pwd/6/6m -I lib/sys -I lib/std lib/thread/ncpu+openbsd.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/futex+openbsd:6.2.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/sem.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/mutex.myr
 	$pwd/6/6m -I lib/sys -I lib/std lib/thread/hookstd.myr
-	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
-	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
+	ar -rcs lib/thread/libthread.a lib/thread/mutex.o lib/thread/atomic.o lib/thread/atomic-impl.o lib/thread/types.o lib/thread/fsbase.o lib/thread/tls.o lib/thread/tls-impl.o lib/thread/hookstd.o lib/thread/sem.o lib/thread/common.o lib/thread/ncpu.o lib/thread/exit.o lib/thread/futex.o lib/thread/spawn.o
+	$pwd/muse/muse -o lib/thread/libthread.use -p thread lib/thread/mutex.use lib/thread/atomic.use lib/thread/types.use lib/thread/fsbase.use lib/thread/tls.use lib/thread/hookstd.use lib/thread/sem.use lib/thread/common.use lib/thread/ncpu.use lib/thread/futex.use lib/thread/spawn.use
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/opts.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/syssel.myr
 	$pwd/6/6m -I lib/sys -I lib/std -I lib/bio -I lib/regex -I lib/thread mbld/libs.myr
diff --git a/rt/start-freebsd.s b/rt/start-freebsd.s
index 9c1091b3..399fac04 100644
--- a/rt/start-freebsd.s
+++ b/rt/start-freebsd.s
@@ -4,6 +4,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@ sys$__cenvp:
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -35,6 +40,16 @@ _start:
 	pushq	%rcx
 	call	cvt
 
+	/* set up the initial tls region for the main thread */
+	subq	$0x10,%rsp
+	movq	$165,%rax		/* sysarch */
+	movq	$129,%rdi		/* Archamd64setfs */
+	leaq	thread$__tls(%rip),%rsi
+	movq	%rsi,(%rsp)
+	movq	%rsp,%rsi
+	syscall
+	addq	$0x10,%rsp
+
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
 	call	__init__
diff --git a/rt/start-linux.s b/rt/start-linux.s
index 742b4a38..a0cbfb6f 100644
--- a/rt/start-linux.s
+++ b/rt/start-linux.s
@@ -4,6 +4,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@ sys$__cenvp:
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -36,6 +41,12 @@ _start:
 	pushq	%rcx
 	call	cvt
 
+	/* set up the initial tls region for the main thread */
+	movq	$158,%rax		/* arch_prctl */
+	movq	$0x1002,%rdi		/* Archsetfs */
+	leaq	thread$__tls(%rip),%rsi
+	syscall
+
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
 	call	__init__
diff --git a/rt/start-netbsd.s b/rt/start-netbsd.s
index dd3213e5..3a6cfc70 100644
--- a/rt/start-netbsd.s
+++ b/rt/start-netbsd.s
@@ -12,6 +12,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -19,6 +23,7 @@ sys$__cenvp:
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -44,6 +49,16 @@ _start:
 	pushq	%rcx
 	call	cvt
 
+	/* set up the initial tls region for the main thread */
+	subq	$0x10,%rsp
+	movq	$165,%rax		/* sysarch */
+	movq	$15,%rdi		/* X8664setfsbase */
+	leaq	thread$__tls(%rip),%rsi
+	movq	%rsi,(%rsp)
+	movq	%rsp,%rsi
+	syscall
+	addq	$0x10,%rsp
+
 	xorq %rbp,%rbp
 	/* call pre-main initializers */
 	call	__init__
diff --git a/rt/start-openbsd.s b/rt/start-openbsd.s
index c0e061af..eef7b454 100644
--- a/rt/start-openbsd.s
+++ b/rt/start-openbsd.s
@@ -13,6 +13,10 @@
 sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 88 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 24 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -20,6 +24,7 @@ sys$__cenvp:
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl _start
@@ -45,6 +50,11 @@ _start:
 	pushq	%rcx
 	call	cvt
 
+	/* set up the initial tls region for the main thread */
+	movq	$329,%rax		/* Sys__set_tcb */
+	leaq	thread$__tls(%rip),%rdi
+	syscall
+
 	xorq %rbp,%rbp
 	/*
 	  we're done startup, and we kind of want
diff --git a/rt/start-osx.s b/rt/start-osx.s
index b43b30c3..a1e18568 100644
--- a/rt/start-osx.s
+++ b/rt/start-osx.s
@@ -4,6 +4,10 @@
 _sys$__cenvp:
     .quad 0
 
+.globl thread$__tls
+thread$__tls:
+    .fill 104 /* sizeof(tlshdr) + (8 * sizeof(void#)) = 40 + 64 */
+
 .text
 /*
  * The entry point for the whole program.
@@ -11,6 +15,7 @@ _sys$__cenvp:
  *  - Sets up all argc entries as slices
  *  - Converts argc/argv to a slice
  *  - Stashes a raw envp copy in __cenvp (for syscalls to use)
+ *  - Sets up thread local storage for the main thread
  *  - Calls main()
  */
 .globl start
@@ -36,6 +41,12 @@ start:
 	pushq	%rcx
 	call	cvt
 
+	/* set up the initial tls region for the main thread */
+	movq	$0x3000003,%rax		/* undocumented setgsbase syscall */
+	leaq	thread$__tls(%rip),%rdi
+	movq	%rdi,0x20(%rdi)		/* also store a copy in __tls.self */
+	syscall
+
 	xorq %rbp,%rbp
 	call	___init__
 	/* enter the main program */
diff --git a/support/syscall-gen/types+freebsd-x64.frag b/support/syscall-gen/types+freebsd-x64.frag
index f8990533..5ed178c9 100644
--- a/support/syscall-gen/types+freebsd-x64.frag
+++ b/support/syscall-gen/types+freebsd-x64.frag
@@ -32,6 +32,7 @@ type id		= int64
 type cpulevel	= int
 type cpusetid	= int
 type idtype	= int
+type sysarchop	= int
 
 type acltype	= int
 type acltag	= uint32
@@ -796,5 +797,12 @@ const Sigthr	: signo = 32	/* reserved by thread library. */
 const Siglwp	: signo = Sigthr
 const Siglibrt	: signo = 33	/* reserved by real-time library. */
 
+/* sysarch ops */
+const Archamd64getfs   : sysarchop = 128
+const Archamd64setfs   : sysarchop = 129
+const Archamd64getgs   : sysarchop = 130
+const Archamd64setgs   : sysarchop = 131
+const Archamd64getxfpu : sysarchop = 132
+
 extern const syscall : (sc:scno, args:... -> int64)
 extern var __cenvp : byte##
diff --git a/support/syscall-gen/types+linux-x64.frag b/support/syscall-gen/types+linux-x64.frag
index 9e90a5ac..966de307 100644
--- a/support/syscall-gen/types+linux-x64.frag
+++ b/support/syscall-gen/types+linux-x64.frag
@@ -38,6 +38,7 @@ type fallocmode	= uint32
 type mfdflags	= uint32
 type aiocontext	= uint64
 type msg	= void#
+type arch_prctlop	= uint64
 
 
 type clock = union
@@ -584,6 +585,12 @@ const Seekend	: whence = 2
 /* return value for a failed mapping */
 const Mapbad	: byte# = (-1 : byte#)
 
+/* arch_prctl ops */
+const Archsetgs : arch_prctlop = 0x1001
+const Archsetfs : arch_prctlop = 0x1002
+const Archgetfs : arch_prctlop = 0x1003
+const Archgetgs : arch_prctlop = 0x1004
+
 /* signal flags */
 const Sanocldstop	: sigflags = 0x00000001
 const Sanocldwait	: sigflags = 0x00000002
-- 
2.19.0


Follow-Ups:
Re: [PATCH] Add thread-local storage for POSIX-y platforms.Ori Bernstein <ori@xxxxxxxxxxxxxx>
References:
[PATCH] Add thread-local storage for POSIX-y platforms.iriri <iri@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>