diff --git a/README.md b/README.md index 6734b092..f47a298c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,8 @@ reference interpreter and test suite. The Component Model is currently being incrementally developed and stabilized as part of [WASI Preview 2]. The subsequent [WASI Preview 3] milestone will be -primarily concerned with the addition of [async support][Concurrency Model]. +primarily concerned with the addition of [async and thread support][Concurrency +Model]. ## Contributing @@ -32,11 +33,10 @@ To contribute to any of these repositories, see the Community Group's [IDL]: design/mvp/WIT.md [Text Format]: design/mvp/Explainer.md [Binary Format]: design/mvp/Binary.md -[Concurrency Model]: design/mvp/Async.md +[Concurrency Model]: design/mvp/Concurrency.md [Canonical ABI]: design/mvp/CanonicalABI.md [formal spec]: spec/ [W3C WebAssembly Community Group]: https://www.w3.org/community/webassembly/ [Contributing Guidelines]: https://webassembly.org/community/contributing/ [WASI Preview 2]: https://github.com/WebAssembly/WASI/tree/main/wasip2#readme [WASI Preview 3]: https://github.com/WebAssembly/WASI/tree/main/wasip2#looking-forward-to-preview-3 -[Async Support]: https://docs.google.com/presentation/d/1MNVOZ8hdofO3tI0szg_i-Yoy0N2QPU2C--LzVuoGSlE/edit?usp=share_link diff --git a/design/mvp/Binary.md b/design/mvp/Binary.md index f87b4d1c..df1e4a2b 100644 --- a/design/mvp/Binary.md +++ b/design/mvp/Binary.md @@ -297,7 +297,7 @@ canon ::= 0x00 0x00 f: opts: ft: => (canon lift | 0x05 => (canon task.cancel (core func)) ๐Ÿ”€ | 0x0a 0x7f i: => (canon context.get i32 i (core func)) ๐Ÿ”€ | 0x0b 0x7f i: => (canon context.set i32 i (core func)) ๐Ÿ”€ - | 0x0c cancel?: => (canon yield cancel? (core func)) ๐Ÿ”€ + | 0x0c cancel?: => (canon thread.yield cancel? (core func)) ๐Ÿ”€ | 0x06 async?: => (canon subtask.cancel async? 
(core func)) ๐Ÿ”€ | 0x0d => (canon subtask.drop (core func)) ๐Ÿ”€ | 0x0e t: => (canon stream.new t (core func)) ๐Ÿ”€ @@ -322,13 +322,21 @@ canon ::= 0x00 0x00 f: opts: ft: => (canon lift | 0x21 cancel?: m: => (canon waitable-set.poll cancel? (memory m) (core func)) ๐Ÿ”€ | 0x22 => (canon waitable-set.drop (core func)) ๐Ÿ”€ | 0x23 => (canon waitable.join (core func)) ๐Ÿ”€ - | 0x40 ft: => (canon thread.spawn_ref ft (core func)) ๐Ÿงต - | 0x41 ft: tbl: => (canon thread.spawn_indirect ft tbl (core func)) ๐Ÿงต - | 0x42 => (canon thread.available_parallelism (core func)) ๐Ÿงต + | 0x26 => (canon thread.index (core func)) ๐Ÿงต + | 0x27 ft: tbl: => (canon thread.new_indirect ft tbl (core func)) ๐Ÿงต + | 0x28 cancel?: => (canon thread.switch-to cancel? (core func)) ๐Ÿงต + | 0x29 cancel?: => (canon thread.suspend cancel? (core func)) ๐Ÿงต + | 0x2a => (canon thread.resume-later (core func)) ๐Ÿงต + | 0x2b cancel?: => (canon thread.yield-to cancel? (core func)) ๐Ÿงต + | 0x40 shared?: ft: => (canon thread.spawn_ref shared? ft (core func)) ๐Ÿงตโ‘ก + | 0x41 shared?: ft: tbl: => (canon thread.spawn_indirect shared? ft tbl (core func)) ๐Ÿงตโ‘ก + | 0x42 shared?: => (canon thread.available-parallelism shared? (core func)) ๐Ÿงตโ‘ก async? ::= 0x00 => | 0x01 => async cancel? ::= 0x00 => | 0x01 => cancellable ๐ŸšŸ +sh? ::= 0x00 => + | 0x01 => shared ๐Ÿงตโ‘ก opts ::= opt*:vec() => opt* canonopt ::= 0x00 => string-encoding=utf8 | 0x01 => string-encoding=utf16 @@ -512,6 +520,8 @@ named once. repurposed. * Most built-ins should have a `*` immediate instead of an ad hoc subset of `canonopt`s. +* Add optional `shared` immediate to all canonical definitions (explicitly or + via ``) when shared-everything-threads (๐Ÿงตโ‘ก) is added. 
[`core:byte`]: https://webassembly.github.io/spec/core/binary/values.html#binary-byte diff --git a/design/mvp/CanonicalABI.md b/design/mvp/CanonicalABI.md index f74cbe2d..01fcaab5 100644 --- a/design/mvp/CanonicalABI.md +++ b/design/mvp/CanonicalABI.md @@ -3,9 +3,9 @@ This document defines the Canonical ABI used to convert between the values and functions of components in the Component Model and the values and functions of modules in Core WebAssembly. See the [AST explainer](Explainer.md) for a -walkthrough of the static structure of a component and the -[async explainer](Async.md) for a high-level description of the async model -being specified here. +walkthrough of the static structure of a component and the [concurrency +explainer] for a high-level description of the concurrency concepts being +specified here. * [Introduction](#introduction) * [Embedding](#embedding) @@ -16,7 +16,6 @@ being specified here. * [Component Instance State](#component-instance-state) * [Table State](#table-state) * [Resource State](#resource-state) - * [Context-Local Storage](#context-local-storage) * [Thread State](#thread-state) * [Waitable State](#waitable-state) * [Task State](#task-state) @@ -47,7 +46,6 @@ being specified here. * [`canon backpressure.{inc,dec}`](#-canon-backpressureincdec) ๐Ÿ”€ * [`canon task.return`](#-canon-taskreturn) ๐Ÿ”€ * [`canon task.cancel`](#-canon-taskcancel) ๐Ÿ”€ - * [`canon yield`](#-canon-yield) ๐Ÿ”€ * [`canon waitable-set.new`](#-canon-waitable-setnew) ๐Ÿ”€ * [`canon waitable-set.wait`](#-canon-waitable-setwait) ๐Ÿ”€ * [`canon waitable-set.poll`](#-canon-waitable-setpoll) ๐Ÿ”€ @@ -60,12 +58,19 @@ being specified here. 
* [`canon future.{read,write}`](#-canon-futurereadwrite) ๐Ÿ”€ * [`canon {stream,future}.cancel-{read,write}`](#-canon-streamfuturecancel-readwrite) ๐Ÿ”€ * [`canon {stream,future}.drop-{readable,writable}`](#-canon-streamfuturedrop-readablewritable) ๐Ÿ”€ + * [`canon thread.index`](#-canon-threadindex) ๐Ÿงต + * [`canon thread.new_indirect`](#-canon-threadnew_indirect) ๐Ÿงต + * [`canon thread.switch-to`](#-canon-threadswitch-to) ๐Ÿงต + * [`canon thread.suspend`](#-canon-threadsuspend) ๐Ÿงต + * [`canon thread.resume-later`](#-canon-threadresume-later) ๐Ÿงต + * [`canon thread.yield-to`](#-canon-threadyield-to) ๐Ÿงต + * [`canon thread.yield`](#-canon-threadyield) ๐Ÿงต * [`canon error-context.new`](#-canon-error-contextnew) ๐Ÿ“ * [`canon error-context.debug-message`](#-canon-error-contextdebug-message) ๐Ÿ“ * [`canon error-context.drop`](#-canon-error-contextdrop) ๐Ÿ“ - * [`canon thread.spawn_ref`](#-canon-threadspawn_ref) ๐Ÿงต - * [`canon thread.spawn_indirect`](#-canon-threadspawn_indirect) ๐Ÿงต - * [`canon thread.available_parallelism`](#-canon-threadavailable_parallelism) ๐Ÿงต + * [`canon thread.spawn_ref`](#-canon-threadspawn_ref) ๐Ÿงตโ‘ก + * [`canon thread.spawn_indirect`](#-canon-threadspawn_indirect) ๐Ÿงตโ‘ก + * [`canon thread.available-parallelism`](#-canon-threadavailable-parallelism) ๐Ÿงตโ‘ก ## Introduction @@ -136,7 +141,7 @@ class Store: return ``` The `Store.tick` method does not have an analogue in Core WebAssembly and -enables [native async support](Async.md) in the Component Model. The +enables [native concurrency support](Concurrency.md) in the Component Model. The expectation is that the host will interleave calls to `invoke` with calls to `tick`, repeatedly calling `tick` until there is no more work to do or the store is destroyed. The nondeterministic `random.shuffle` indicates that the @@ -148,10 +153,10 @@ defined [below](#thread-state) as part of the `Thread` class. 
The `FuncInst` passed to `Store.invoke` is defined to take 3 parameters: * an optional `caller` `Supertask` which is used to maintain the - [async callstack](Async.md#structured-concurrency) and enforce the + [async callstack][Structured Concurrency] and enforce the non-reentrance [component invariant]; * an `OnStart` callback that is called by a `FuncInst` to receive its arguments - after waiting on any [backpressure](Async.md#backpressure); + after waiting on any [backpressure]; * an `OnResolve` callback that is called by a `FuncInst` with either a list of return values or, if cancellation has been requested, `None`. @@ -180,10 +185,10 @@ passing `None` and/or skipping the call to `OnStart`). If the `FuncInst` calls `OnResolve` before returning; the returned `Call` object is somewhat vestigial since `request_cancellation` cannot be called. -However, as described in the [async explainer](Async.md#structured-concurrency), -an async call's `Thread` can keep executing after calling `OnResolve`; there's -just nothing (currently) that the caller can know or do about it (hence there -are currently no other methods on `Call`). +However, as described in the [concurrency explainer], an async call's +`Thread` can keep executing after calling `OnResolve`; there's just nothing +(currently) that the caller can know or do about it (hence there are +currently no other methods on `Call`). ## Supporting definitions @@ -413,43 +418,17 @@ class ResourceType(Type): ``` -#### Context-Local Storage - -The `ContextLocalStorage` class implements [context-local storage], with each -new `Task` getting a fresh, zero-initialized `ContextLocalStorage` that can be -accessed by core wasm code using `canon context.{get,set}`. (In the future, -when threads are integrated, each `thread.spawn`ed thread would also get a -fresh, zero-initialized `ContextLocalStorage`.) 
-```python -class ContextLocalStorage: - LENGTH = 1 - array: list[int] - - def __init__(self): - self.array = [0] * ContextLocalStorage.LENGTH - - def set(self, i, v): - assert(types_match_values(['i32'], [v])) - self.array[i] = v - - def get(self, i): - return self.array[i] -``` -`LENGTH` is currently set to `1`, but the plan is to increase it to `2` once -toolchains are ready to migrate the linear-memory-stack pointer from a -`global` to context-local storage as part of implementing threads. - - #### Thread State -The `Thread` class provides a set of primitive `suspend` and `resume` -operations that are used by the rest of the Canonical ABI definitions to -perform stack switching. Each `Thread` is immutably contained by a single -`Task` (defined [below](#task-state)) and a new `Thread`+`Task` pair is created -for each export call (in `canon_lift`, below). A `thread.new` built-in will be -added at some point in the future to allow core wasm code to explicitly create -threads; at that point `Thread`s and `Task`s will be many-to-one, with a single -`Task` containing many `Thread`s. But until then, `Thread` and `Task` are 1:1. +As described in the [concurrency explainer], threads are created both +*implicitly*, when calling a component export (in `canon_lift` below), and +*explicitly*, when core wasm code calls the `thread.new_indirect` built-in (in +`canon_thread_new_indirect` below). Threads are represented here by the +`Thread` class and the [current thread] is represented by explicitly threading +a reference to a `Thread` through all Core WebAssembly calls so that the +`thread` parameter always points to "the current thread". The `Thread` class +provides a set of primitive control-flow operations that are used by the rest +of the Canonical ABI definitions. `Thread` is implemented using the Python standard library's [`threading`] module. 
While a Python [`threading.Thread`] is a preemptively-scheduled [kernel @@ -477,6 +456,10 @@ class Thread: cancellable: bool cancelled: bool in_event_loop: bool + index: Optional[int] + context: list[int] + + CONTEXT_LENGTH = 2 def running(self): return self.parent_lock is not None @@ -491,8 +474,13 @@ class Thread: assert(self.pending()) return self.ready_func() ``` -The `in_event_loop` field is used by `Task.request_cancellation` (defined -below). The other fields are used directly by `Thread` methods as shown next. +The `in_event_loop` field is used by `Task.request_cancellation` to prevent +unexpected reentrance of `callback` functions. The `index` field stores the +index of the thread in the component instance's table and is initialized only +once a thread is allowed to start executing (after the backpressure gate). The +`context` field holds the [thread-local storage] accessed by the +`context.{get,set}` built-ins. All the other fields are used directly by +`Thread` methods as shown next. 
When a `Thread` is created, an internal `threading.Thread` is started and immediately blocked `acquire()`ing `fiber_lock` (which will be `release()`ed by @@ -507,23 +495,31 @@ immediately blocked `acquire()`ing `fiber_lock` (which will be `release()`ed by self.cancellable = False self.cancelled = False self.in_event_loop = False + self.index = None + self.context = [0] * Thread.CONTEXT_LENGTH def fiber_func(): self.fiber_lock.acquire() assert(self.running()) - thread_func() + thread_func(self) assert(self.running()) self.task.thread_stop(self) + if self.index is not None: + self.task.inst.table.remove(self.index) self.parent_lock.release() self.fiber = threading.Thread(target = fiber_func) self.fiber.start() self.task.thread_start(self) assert(self.suspended()) ``` -`Thread`s register themselves with their parent `Task` (via `thread_start`) +`Thread`s register themselves with their containing `Task` (via `thread_start`) and unregister themselves (via `thread_stop`) when they exit. This registration is used for delivering cancellation requests sent to the `Task` by the caller (via `Task.request_cancellation`) as well as enforcing Canonical ABI rules -when the last (and currently only) `Thread` in a `Task` exits. +when the last `Thread` in a `Task` exits. + +If a `Thread` was not cancelled while waiting for backpressure, it will be +allocated an `index` in the component instance table and, when the `Thread`'s +root function returns, this `index` is deallocated by the code above. Once a `Thread` is created, it will only start `running` when `Thread.resume` is called. Once a thread is `running` it can then be `suspended` again by @@ -542,11 +538,21 @@ able to handle cancellation. This information is stored in the `cancellable` field which is used by `Task.request_cancellation` (defined below) to only `resume` with `cancel = True` when the thread expects it. 
+Lastly, several `Thread` methods below will set the `ready_func` and add the +`Thread` to the `Store.pending` list so that `Store.tick` will call `resume` +when the `ready_func` returns `True`. Once `Thread.resume` is called, the +`ready_func` is reset and the `Thread` is removed again from the +`Store.pending` list since it's no longer in the `pending` state. + Given the above, `Thread.resume` and `Thread.suspend` can be defined complementarily using `parent_lock` and `fiber_lock` as follows: ```python def resume(self, cancel = False): assert(not self.running() and not self.cancelled) + if self.ready_func: + assert(cancel or self.ready_func()) + self.ready_func = None + self.task.inst.store.pending.remove(self) assert(self.cancellable or not cancel) self.cancelled = cancel self.parent_lock = threading.Lock() @@ -569,10 +575,19 @@ complementarily using `parent_lock` and `fiber_lock` as follows: return completed ``` -A `Thread` can also suspend until an arbitrary condition (evaluated by the -given boolean-valued `ready_func`) is satisfied. 
This method is used when a -component needs to wait on some external event (in the host, another component, -or a set of both): +The `Thread.resume_later` method is called by `canon_thread_resume_later` below +to add a `Thread` to the `Store.pending` list with an already-true `ready_func` +so that `Store.tick` will call `Thread.resume` at some nondeterministic point +in the near future: +```python + def resume_later(self): + assert(self.suspended()) + self.ready_func = lambda: True + self.task.inst.store.pending.append(self) +``` + +The `Thread.suspend_until` method is used by multiple internal callers below +to specify a custom `ready_func` that is polled by `Store.tick`: ```python def suspend_until(self, ready_func, cancellable = False) -> bool: assert(self.running()) @@ -580,19 +595,8 @@ or a set of both): return True self.ready_func = ready_func self.task.inst.store.pending.append(self) - completed = self.suspend(cancellable) - assert(cancellable or ready_func()) - self.ready_func = None - self.task.inst.store.pending.remove(self) - return completed + return self.suspend(cancellable) ``` -By adding the current `Thread` to the `Store.pending` list, `suspend_until` -ensures that the `ready_func` is repeatedly polled by `Store.tick` until it -returns `True`, at which point this `Thread` will be `resume()`ed. A practical -implementation would naturally replace this naive polling with a more-efficient -event-triggered mechanism; the goal here is just to specify the allowed -behaviors, not performance. - The `randomint` conjunct on the early return if `ready_func()` is already `True` means that, at any potential suspension point, the embedder can nondeterministically decide whether to switch to another thread or keep running @@ -601,6 +605,42 @@ which `suspend_until`s a condition that's already met (e.g. in the case of `yield`), the embedder can use scheduling heuristics to decide whether to consider the call `BLOCKED` or keep going. 
+The `Thread.switch_to` method is used by `canon_thread_switch_to` below to +suspend the current thread and resume some other thread. Importantly, the +parent of the current thread is *transferred* to the thread being resumed. This +ensures that when an `async`-lowered caller calls an export that does a number +of internal `thread.switch-to`s before suspending, the `async`-lowered caller +resumes execution immediately (as if there were no `thread.switch-to` and +[Asyncify] was used to emulate stack switching instead). +```python + def switch_to(self, cancellable, other: Thread) -> bool: + assert(self.running() and other.suspended()) + assert(not self.cancellable) + self.cancellable = cancellable + assert(self.parent_lock and not other.parent_lock) + other.parent_lock = self.parent_lock + self.parent_lock = None + assert(not self.running() and other.running()) + other.fiber_lock.release() + self.fiber_lock.acquire() + assert(self.running()) + self.cancellable = False + completed = not self.cancelled + self.cancelled = False + return completed +``` + +Lastly, the `Thread.yield_to` method is used by `canon_thread_yield_to` below +to switch execution to some other thread (like `Thread.switch_to`), but leave +the current thread `ready` instead of `suspended`. +```python + def yield_to(self, cancellable, other: Thread) -> bool: + assert(not self.ready_func) + self.ready_func = lambda: True + self.task.inst.store.pending.append(self) + return self.switch_to(cancellable, other) +``` + #### Waitable State @@ -723,13 +763,20 @@ priorities); runtimes do not have to literally randomize event delivery. #### Task State -A "task" is created for each call to a component export and is implicitly -threaded through all core function calls as the "[current task]". Tasks are -represented by objects of the `Task` class which are created by the -`canon_lift` function (defined below). 
`Task` implements the abstract `Call` -and `Supertask` interfaces defined as part of the [Embedding](#embedding) -interface; a `Task` serves both as the `Supertask` of calls it makes to imports -as well as the `Call` object returned by `canon_lift`. +As described in the [concurrency explainer], a "task" is created for each call +to a component export (in `canon_lift` below), tracking the metadata needed to +enforce the Canonical ABI rules associated with the callee as well as implement +caller-requested cancellation. Each task contains 0..N threads that execute on +behalf of the task, starting with the thread that is spawned to execute the +exported function and transitively including additional threads spawned by that +thread via `thread.new_indirect`. + +Tasks are represented here by the `Task` class and the [current task] is +represented by the `Thread.task` field of the [current thread]. `Task` +implements the abstract `Call` and `Supertask` interfaces defined as part of +the [Embedding](#embedding) interface since `Task` serves as both the +`Supertask` of calls it makes to imports as well as the `Call` object returned +for calls to exports. `Task` is introduced in chunks, starting with fields and initialization: ```python @@ -747,8 +794,7 @@ class Task(Call, Supertask): supertask: Optional[Task] on_resolve: OnResolve num_borrows: int - thread: Optional[Thread] - context: ContextLocalStorage + threads: list[Thread] def __init__(self, opts, inst, ft, supertask, on_resolve): self.state = Task.State.INITIAL @@ -758,25 +804,25 @@ class Task(Call, Supertask): self.supertask = supertask self.on_resolve = on_resolve self.num_borrows = 0 - self.thread = None - self.context = ContextLocalStorage() + self.threads = [] ``` -The `thread` field is initialized by `Task.thread_start`, which is called by -`Thread`'s constructor. 
Symmetrically, when the `Thread`'s root function -call returns, `Task.thread_stop` is called to trap if the `OnResolve` callback -has not been called (by the `Task.return_` and `Task.cancel` methods, -defined below). +The `threads` field holds the list of `Thread`s contained by this `Task` and is +populated by `Task.thread_start`, which is called by `Thread`'s constructor. +Symmetrically, when the `Thread`'s root function call returns, +`Task.thread_stop` is called to trap if the `OnResolve` callback has not been +called (by the `Task.return_` and `Task.cancel` methods, defined below). ```python def thread_start(self, thread): - assert(self.thread is None and thread.task is self) - self.thread = thread + assert(thread not in self.threads and thread.task is self) + self.threads.append(thread) def thread_stop(self, thread): - assert(thread is self.thread and thread.task is self) - self.thread = None - trap_if(self.state != Task.State.RESOLVED) - assert(self.num_borrows == 0) + assert(thread in self.threads and thread.task is self) + self.threads.remove(thread) + if len(self.threads) == 0: + trap_if(self.state != Task.State.RESOLVED) + assert(self.num_borrows == 0) ``` The `Task.trap_if_on_the_stack` method checks for unintended reentrance, @@ -787,9 +833,9 @@ distinguish between the deadlock-hazardous kind of reentrance (where the new task is a transitive subtask of a task already running in the same component instance) and the normal kind of async reentrance (where the new task is just a sibling of any existing tasks running in the component instance). Note that, in -the [future](Async.md#TODO), there will be a way for a function to opt in (via -function type attribute) to the hazardous kind of reentrance, which will nuance -this test. +the [future](Concurrency.md#TODO), there will be a way for a function to opt in +(via function type attribute) to the hazardous kind of reentrance, which will +nuance this test. 
```python def trap_if_on_the_stack(self, inst): c = self.supertask @@ -840,13 +886,13 @@ backpressure is disabled. There are three sources of backpressure: by new tasks. ```python - def enter(self): - assert(self.thread is not None) + def enter(self, thread): + assert(thread in self.threads and thread.task is self) def has_backpressure(): return self.inst.backpressure > 0 or (self.needs_exclusive() and self.inst.exclusive) if has_backpressure() or self.inst.num_waiting_to_enter > 0: self.inst.num_waiting_to_enter += 1 - completed = self.thread.suspend_until(lambda: not has_backpressure(), cancellable = True) + completed = thread.suspend_until(lambda: not has_backpressure(), cancellable = True) self.inst.num_waiting_to_enter -= 1 if not completed: self.cancel() @@ -857,9 +903,10 @@ backpressure is disabled. There are three sources of backpressure: return True ``` Since the order in which suspended threads are resumed is nondeterministic (see -`Store.tick` above), once `Task.enter` suspends due to backpressure, the above -definition allows the host to arbitrarily select which tasks to resume in which -order. Additionally, the above definition ensures the following properties: +`Store.tick` above), once `Task.enter` suspends the [current thread] due to +backpressure, the above definition allows the host to arbitrarily select which +threads to resume in which order. Additionally, the above definition ensures +the following properties: * While a callee is waiting to `enter`, if the caller requests cancellation, the callee is immediately cancelled. * When backpressure is disabled then reenabled, no new tasks start, even @@ -867,12 +914,12 @@ order. Additionally, the above definition ensures the following properties: backpressure (i.e., disabling backpressure never unleashes an unstoppable thundering heard of pending tasks). 
-Symmetrically, the `Task.exit` method is called before a `Task`'s `Thread` +Symmetrically, the `Task.exit` method is called before a `Task`'s main `Thread` returns to clear the `exclusive` flag set by `Task.enter`, allowing other `needs_exclusive` tasks to start or make progress: ```python def exit(self): - assert(self.thread is not None) + assert(len(self.threads) > 0) if self.needs_exclusive(): assert(self.inst.exclusive) self.inst.exclusive = False @@ -881,21 +928,30 @@ returns to clear the `exclusive` flag set by `Task.enter`, allowing other The `Task.request_cancellation` method is called by the host or wasm caller (via the `Call` interface of `Task`) to signal that they don't need the return value and that the caller should hurry up and call the `OnResolve` callback. If -the cancelled `Task`'s `Thread` is expecting cancellation (e.g., when an `async -callback` export returns to the event loop or when `waitable-set.wait` is -called with `cancellable` set), `request_cancellation` immediately resumes the -thread, giving the thread the chance to handle cancellation promptly (allowing -`subtask.cancel` to complete eagerly without returning `BLOCKED`). Otherwise, -the cancellation request is remembered in the `Task`'s `state` so that it can -be delivered in the future by `Task.suspend_until` (defined next). +*any* of a cancelled `Task`'s `Thread`s are expecting cancellation (e.g., when +an `async callback` export returns to the event loop or when a `waitable-set.*` +or `thread.*` built-in is called with `cancellable` set), `request_cancellation` +immediately resumes that thread (picking one nondeterministically if there are +multiple), giving the thread the chance to handle cancellation promptly +(allowing `subtask.cancel` to complete eagerly without returning `BLOCKED`). +Otherwise, the cancellation request is remembered in the `Task`'s `state` so +that it can be delivered in the future by `Task.deliver_pending_cancel`. 
```python def request_cancellation(self): assert(self.state == Task.State.INITIAL) - if self.thread.cancellable and not (self.thread.in_event_loop and self.inst.exclusive): + random.shuffle(self.threads) + for thread in self.threads: + if thread.cancellable and not (thread.in_event_loop and self.inst.exclusive): + self.state = Task.State.CANCEL_DELIVERED + thread.resume(cancel = True) + return + self.state = Task.State.PENDING_CANCEL + + def deliver_pending_cancel(self, cancellable) -> bool: + if cancellable and self.state == Task.State.PENDING_CANCEL: self.state = Task.State.CANCEL_DELIVERED - self.thread.resume(cancel = True) - else: - self.state = Task.State.PENDING_CANCEL + return True + return False ``` `in_event_loop` is set by the `async callback` event loop (in `canon_lift`, defined below) every time the event loop suspends the thread and is used here @@ -907,14 +963,32 @@ loop, which sets `cancellable`) but it cannot be resumed until the second task returns to its event loop (since `async callback` wasm code is non-reentrant and `needs_exclusive`). 
-The `Task.suspend_until` method wraps `Thread.suspend_until` to deliver any -pending cancellation set by `Task.request_cancellation`: +The following `Task` methods wrap corresponding `Thread` methods after first +delivering any pending cancellations set by `Task.request_cancellation`: ```python - def suspend_until(self, ready_func, cancellable) -> bool: - if cancellable and self.state == Task.State.PENDING_CANCEL: - self.state = Task.State.CANCEL_DELIVERED + def suspend(self, thread, cancellable) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): return False - return self.thread.suspend_until(ready_func, cancellable) + return thread.suspend(cancellable) + + def suspend_until(self, ready_func, thread, cancellable) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): + return False + return thread.suspend_until(ready_func, cancellable) + + def switch_to(self, thread, cancellable, other_thread) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): + return False + return thread.switch_to(cancellable, other_thread) + + def yield_to(self, thread, cancellable, other_thread) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): + return False + return thread.yield_to(cancellable, other_thread) ``` The `Task.wait_until` method is called by `canon_waitable_set_wait` and from @@ -924,11 +998,12 @@ event to deliver *and* the caller-supplied condition is met. While suspended, the `num_waiting` counter is kept above `0` so that `waitable-set.drop` will trap if another task tries to drop the waitable set being used. 
```python - def wait_until(self, ready_func, wset, cancellable) -> EventTuple: + def wait_until(self, ready_func, thread, wset, cancellable) -> EventTuple: + assert(thread in self.threads and thread.task is self) wset.num_waiting += 1 def ready_and_has_event(): return ready_func() and wset.has_pending_event() - if not self.suspend_until(ready_and_has_event, cancellable): + if not self.suspend_until(ready_and_has_event, thread, cancellable): event = (EventCode.TASK_CANCELLED, 0, 0) else: event = wset.get_pending_event() @@ -943,9 +1018,10 @@ pending event, returning `EventCode.NONE` if there is none already. However, `poll_until` *does* call `suspsend_until` to allow the runtime to nondeterministically switch to another task (or not). ```python - def poll_until(self, ready_func, wset, cancellable) -> Optional[EventTuple]: + def poll_until(self, ready_func, thread, wset, cancellable) -> Optional[EventTuple]: + assert(thread in self.threads and thread.task is self) wset.num_waiting += 1 - if not self.suspend_until(ready_func, cancellable): + if not self.suspend_until(ready_func, thread, cancellable): event = (EventCode.TASK_CANCELLED, 0, 0) elif wset.has_pending_event(): event = wset.get_pending_event() @@ -955,12 +1031,13 @@ nondeterministically switch to another task (or not). return event ``` -The `Task.yield_until` method is called by `canon_yield` and from +The `Task.yield_until` method is called by `canon_thread_yield` and from the event loop in `canon_lift` when `CallbackCode.YIELD` is returned. `yield_until` works like `poll_until` if given a fresh empty waitable set. 
```python - def yield_until(self, ready_func, cancellable) -> EventTuple: - if not self.suspend_until(ready_func, cancellable): + def yield_until(self, ready_func, thread, cancellable) -> EventTuple: + assert(thread in self.threads and thread.task is self) + if not self.suspend_until(ready_func, thread, cancellable): return (EventCode.TASK_CANCELLED, 0, 0) else: return (EventCode.NONE, 0, 0) @@ -1030,7 +1107,7 @@ class Subtask(Waitable): ``` The `state` field of `Subtask` tracks the callee's progression from the initial -[`STARTING`](Async.md#backpressure) state along the [subtask state machine]. +[`STARTING`][Backpressure] state along the [subtask state machine]. A `Subtask` is considered "resolved" if it has returned a value or if, after having had cancellation requested by the caller, called `task.cancel` (either before or after calling `OnStart`): @@ -1277,7 +1354,7 @@ wasm code. Specifically, the point of the `OnCopy*` callbacks is to specify that *multiple* reads or writes are allowed into the same `Buffer` up until the point where either the buffer is full or the calling core wasm code receives a `STREAM_READ` or `STREAM_WRITE` progress event (in which case `ReclaimBuffer` is -called). This reduces the number of task-switches required by the spec, +called). This reduces the number of context-switches required by the spec, particularly when streaming between two components. 
The `SharedStreamImpl` class implements both `ReadableStream` and @@ -1421,8 +1498,9 @@ out into the `CopyEnd` class that is derived below: ```python class CopyState(Enum): IDLE = 1 - COPYING = 2 - DONE = 3 + SYNC_COPYING = 2 + ASYNC_COPYING = 3 + DONE = 4 class CopyEnd(Waitable): state: CopyState @@ -1431,14 +1509,19 @@ class CopyEnd(Waitable): Waitable.__init__(self) self.state = CopyState.IDLE + def copying(self): + return self.state == CopyState.SYNC_COPYING or self.state == CopyState.ASYNC_COPYING + def drop(self): - trap_if(self.state == CopyState.COPYING) + trap_if(self.copying()) Waitable.drop(self) ``` As shown in `drop`, attempting to drop a readable or writable end while a copy is in progress traps. This means that client code must take care to wait for these operations to finish (potentially cancelling them via -`stream.cancel-{read,write}`) before dropping. +`stream.cancel-{read,write}`) before dropping. The `SYNC_COPYING` vs. `ASYNC_COPYING` +distinction is tracked in the state to determine whether the copy operation can +be cancelled. Given the above, we can define the concrete `{Readable,Writable}StreamEnd` classes which are almost entirely symmetric, with the only difference being 
@@ -3052,10 +3135,13 @@ a `lift`ed function starts executing: def canon_lift(opts, inst, ft, callee, caller, on_start, on_resolve) -> Call: task = Task(opts, inst, ft, caller, on_resolve) task.trap_if_on_the_stack(inst) - def thread_func(): - if not task.enter(): + def thread_func(thread): + if not task.enter(thread): return + assert(thread.index is None) + thread.index = inst.table.add(thread) + cx = LiftLowerContext(opts, inst, task) args = on_start() flat_args = lower_flat_values(cx, MAX_FLAT_PARAMS, args, ft.param_types()) @@ -3073,9 +3159,11 @@ resolved. If the caller cancels the new `Task` while the `Task` is still waiting to `enter`, the call is aborted before the arguments are lowered (which means that owned-handle arguments are not transferred). -Once the backpressure gate is cleared, the arguments are lowered into core wasm -values and memory according to the `canonopt` immediates of `canon lift` (as -defined by `lower_flat_values` above). +Once the backpressure gate is cleared, the `Thread` is added to the callee's +component instance's table (storing the index for later retrieval by the +`thread.index` built-in) and the arguments are lowered into core wasm values +and memory according to the `canonopt` immediates of `canon lift` (as defined +by `lower_flat_values` above). If the `async` `canonopt` is *not* specified, a `lift`ed function then calls the core wasm callee, passing the lowered arguments in core function parameters @@ -3089,13 +3177,13 @@ synchronous functions cannot overlap execution; attempts by callers to make overlapping calls will result in backpressure in `Task.enter`. 
```python if opts.sync: - flat_results = call_and_trap_on_throw(callee, task, flat_args) + flat_results = call_and_trap_on_throw(callee, thread, flat_args) assert(types_match_values(flat_ft.results, flat_results)) result = lift_flat_values(cx, MAX_FLAT_RESULTS, CoreValueIter(flat_results), ft.result_type()) task.return_(result) if opts.post_return is not None: inst.may_leave = False - [] = call_and_trap_on_throw(opts.post_return, task, flat_results) + [] = call_and_trap_on_throw(opts.post_return, thread, flat_results) inst.may_leave = True task.exit() return @@ -3110,21 +3198,20 @@ In both of the `async` cases below (with or without `callback`), the `task.return` built-in must be called, providing the return value as core wasm *parameters* to the `task.return` built-in (rather than as core function results as in the synchronous case). If `task.return` is *not* called by the -time the `Task`'s last (and, currently, only) `Thread` exits, there is a trap -(in `Task.thread_stop`). +time the `Task`'s last `Thread` exits, there is a trap (in `Task.thread_stop`). In the `async` non-`callback` ("stackful async") case, there is a single call to the core wasm callee which must return empty core results. Waiting for async I/O happens by the callee synchronously calling built-ins like `waitable-set.wait`. When these built-ins need to block, they transitively call -`Thread.suspend` which allows other concurrent tasks to make progress. Note -that, since `Task.enter` does *not* acquire the `exclusive` lock for stackful -async functions, calls to `waitable-set.wait` made by a stackful async function -do not prevent other stackful async calls from starting or progressing in the -same component instance. +`Thread.suspend` which allows other threads to make progress. 
Note that, since +`Task.enter` does *not* acquire the `exclusive` lock for stackful async +functions, calls to `waitable-set.wait` made by a stackful async function do +not prevent any other threads from starting or resuming in the same component +instance. ```python if not opts.callback: - [] = call_and_trap_on_throw(callee, task, flat_args) + [] = call_and_trap_on_throw(callee, thread, flat_args) assert(types_match_values(flat_ft.results, [])) task.exit() return @@ -3135,26 +3222,26 @@ first calling the core wasm callee and then repeatedly calling the `callback` function (specified as a `funcidx` immediate in `canon lift`) until the `EXIT` code (`0`) is returned: ```python - [packed] = call_and_trap_on_throw(callee, task, flat_args) + [packed] = call_and_trap_on_throw(callee, thread, flat_args) code,si = unpack_callback_result(packed) while code != CallbackCode.EXIT: thread.in_event_loop = True inst.exclusive = False match code: case CallbackCode.YIELD: - event = task.yield_until(lambda: not inst.exclusive, cancellable = True) + event = task.yield_until(lambda: not inst.exclusive, thread, cancellable = True) case CallbackCode.WAIT: wset = inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.wait_until(lambda: not inst.exclusive, wset, cancellable = True) + event = task.wait_until(lambda: not inst.exclusive, thread, wset, cancellable = True) case CallbackCode.POLL: wset = inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.poll_until(lambda: not inst.exclusive, wset, cancellable = True) + event = task.poll_until(lambda: not inst.exclusive, thread, wset, cancellable = True) thread.in_event_loop = False inst.exclusive = True event_code, p1, p2 = event - [packed] = call_and_trap_on_throw(opts.callback, task, [event_code, p1, p2]) + [packed] = call_and_trap_on_throw(opts.callback, thread, [event_code, p1, p2]) code,si = unpack_callback_result(packed) task.exit() return @@ -3184,8 +3271,8 @@ caller. 
If `thread_func` and the core wasm `callee` return a value (by calling the `OnResolve` callback) before blocking, the call will complete synchronously even for `async` callers. Note that if an `async` callee calls `OnResolve` and *then* blocks, the caller will see the call complete synchronously even though -the callee is still running concurrently in the `Thread` created here (see the -[Async Explainer](Async.md#structured-concurrency) for more on this). +the callee is still running concurrently in the `Thread` created here (see +the [concurrency explainer] for more on this). ```python thread = Thread(task, thread_func) thread.resume() @@ -3221,9 +3308,9 @@ boundaries. Thus, if a component wishes to signal an error, it must use some sort of explicit type such as `result` (whose `error` case particular language bindings may choose to map to and from exceptions): ```python -def call_and_trap_on_throw(callee, task, args): +def call_and_trap_on_throw(callee, thread, args): try: - return callee(task, args) + return callee(thread, args) except CoreWebAssemblyException: trap() ``` @@ -3249,15 +3336,15 @@ validation is performed where `$callee` has type `$ft`: When instantiating component instance `$inst`, `$f` is defined to be the partially-bound closure `canon_lower($opts, $ft, $callee)` which has two remaining arguments passed at runtime: -* `task`, the [current task] +* `thread`, the [current thread] * `flat_args`, a list of core wasm values passed by the caller Based on this, `canon_lower` is defined in chunks as follows: ```python -def canon_lower(opts, ft, callee: FuncInst, task, flat_args): - trap_if(not task.inst.may_leave) +def canon_lower(opts, ft, callee: FuncInst, thread, flat_args): + trap_if(not thread.task.inst.may_leave) subtask = Subtask() - cx = LiftLowerContext(opts, task.inst, subtask) + cx = LiftLowerContext(opts, thread.task.inst, subtask) ``` Each call to `canon_lower` creates a new `Subtask`. 
However, this `Subtask` is only added to the current component instance's table (below) if `async` is @@ -3307,7 +3394,7 @@ above). nonlocal flat_results flat_results = lower_flat_values(cx, max_flat_results, result, ft.result_type(), flat_args) - subtask.callee = callee(task, on_start, on_resolve) + subtask.callee = callee(thread.task, on_start, on_resolve) ``` The `Subtask.state` field is updated by the callbacks to keep track of the call progres. The `on_progress` variable starts as a no-op, but is used by the @@ -3331,7 +3418,7 @@ use a plain synchronous function call instead, as expected. ```python if opts.sync: if not subtask.resolved(): - task.thread.suspend_until(subtask.resolved) + thread.suspend_until(subtask.resolved) assert(types_match_values(flat_ft.results, flat_results)) subtask.deliver_resolve() return flat_results @@ -3356,7 +3443,7 @@ argument memory can be reused, but the result buffer has to be kept reserved. subtask.deliver_resolve() return [Subtask.State.RETURNED] else: - subtaski = task.inst.table.add(subtask) + subtaski = thread.task.inst.table.add(subtask) def on_progress(): def subtask_event(): if subtask.resolved(): @@ -3394,10 +3481,10 @@ Calling `$f` invokes the following function, which adds an owning handle containing the given resource representation to the current component instance's table: ```python -def canon_resource_new(rt, task, rep): - trap_if(not task.inst.may_leave) +def canon_resource_new(rt, thread, rep): + trap_if(not thread.task.inst.may_leave) h = ResourceHandle(rt, rep, own = True) - i = task.inst.table.add(h) + i = thread.task.inst.table.add(h) return [i] ``` @@ -3417,9 +3504,9 @@ Calling `$f` invokes the following function, which removes the handle from the current component instance's table and, if the handle was owning, calls the resource's destructor. 
```python -def canon_resource_drop(rt, sync, task, i): - trap_if(not task.inst.may_leave) - inst = task.inst +def canon_resource_drop(rt, sync, thread, i): + trap_if(not thread.task.inst.may_leave) + inst = thread.task.inst h = inst.table.remove(i) trap_if(not isinstance(h, ResourceHandle)) trap_if(h.rt is not rt) @@ -3436,9 +3523,9 @@ def canon_resource_drop(rt, sync, task, i): callee_opts = CanonicalOptions(sync = rt.dtor_sync, callback = rt.dtor_callback) ft = FuncType([U32Type()],[]) callee = partial(canon_lift, callee_opts, rt.impl, ft, rt.dtor) - flat_results = canon_lower(caller_opts, ft, callee, task, [h.rep]) + flat_results = canon_lower(caller_opts, ft, callee, thread, [h.rep]) else: - task.trap_if_on_the_stack(rt.impl) + thread.task.trap_if_on_the_stack(rt.impl) else: h.borrow_scope.num_borrows -= 1 return flat_results @@ -3472,8 +3559,8 @@ validation specifies: Calling `$f` invokes the following function, which extracts the resource representation from the handle in the current component instance's table: ```python -def canon_resource_rep(rt, task, i): - h = task.inst.table.get(i) +def canon_resource_rep(rt, thread, i): + h = thread.task.inst.table.get(i) trap_if(not isinstance(h, ResourceHandle)) trap_if(h.rt is not rt) return [h.rep] @@ -3489,17 +3576,17 @@ For a canonical definition: (canon context.get $t $i (core func $f)) ``` validation specifies: -* `$t` must be `i32` (for now; see [here][context-local storage]) -* `$i` must be less than `ContextLocalStorage.LENGTH` (`1`) +* `$t` must be `i32` (for now; see [here][thread-local storage]) +* `$i` must be less than `Thread.CONTEXT_LENGTH` (`2`) * `$f` is given type `(func (result i32))` -Calling `$f` invokes the following function, which reads the [context-local -storage] of the [current task]: +Calling `$f` invokes the following function, which reads the [thread-local +storage] of the [current thread]: ```python -def canon_context_get(t, i, task): +def canon_context_get(t, i, thread): assert(t == 
'i32') - assert(i < ContextLocalStorage.LENGTH) - return [task.context.get(i)] + assert(i < Thread.CONTEXT_LENGTH) + return [thread.context[i]] ``` @@ -3510,17 +3597,17 @@ For a canonical definition: (canon context.set $t $i (core func $f)) ``` validation specifies: -* `$t` must be `i32` (for now; see [here][context-local storage]) -* `$i` must be less than `ContextLocalStorage.LENGTH` (`1`) +* `$t` must be `i32` (for now; see [here][thread-local storage]) +* `$i` must be less than `Thread.CONTEXT_LENGTH` (`2`) * `$f` is given type `(func (param $v i32))` -Calling `$f` invokes the following function, which writes to the [context-local -storage] of the [current task]: +Calling `$f` invokes the following function, which writes to the [thread-local +storage] of the [current thread]: ```python -def canon_context_set(t, i, task, v): +def canon_context_set(t, i, thread, v): assert(t == 'i32') - assert(i < ContextLocalStorage.LENGTH) - task.context.set(i, v) + assert(i < Thread.CONTEXT_LENGTH) + thread.context[i] = v return [] ``` @@ -3543,9 +3630,9 @@ Calling `$f` invokes the following function, which sets the `backpressure` counter to `1` or `0`. `Task.enter` waits for `backpressure` to be `0` before allowing new tasks to start. 
```python -def canon_backpressure_set(task, flat_args): +def canon_backpressure_set(thread, flat_args): assert(len(flat_args) == 1) - task.inst.backpressure = int(bool(flat_args[0])) + thread.task.inst.backpressure = int(bool(flat_args[0])) return [] ``` @@ -3561,16 +3648,16 @@ validation specifies: Calling `$inc` or `$dec` invokes one of the following functions: ```python -def canon_backpressure_inc(task): - assert(0 <= task.inst.backpressure < 2**16) - task.inst.backpressure += 1 - trap_if(task.inst.backpressure == 2**16) +def canon_backpressure_inc(thread): + assert(0 <= thread.task.inst.backpressure < 2**16) + thread.task.inst.backpressure += 1 + trap_if(thread.task.inst.backpressure == 2**16) return [] -def canon_backpressure_dec(task): - assert(0 <= task.inst.backpressure < 2**16) - task.inst.backpressure -= 1 - trap_if(task.inst.backpressure < 0) +def canon_backpressure_dec(thread): + assert(0 <= thread.task.inst.backpressure < 2**16) + thread.task.inst.backpressure -= 1 + trap_if(thread.task.inst.backpressure < 0) return [] ``` `Task.enter` waits for `backpressure` to return to `0` before allowing new @@ -3592,9 +3679,10 @@ specifies: * [`lift($f.result)` above](#canonopt-validation) defines required options Calling `$f` invokes the following function which lifts the results from core -wasm state and passes them to the caller via `Task.return_`: +wasm state and passes them to the [current task]'s caller via `Task.return_`: ```python -def canon_task_return(task, result_type, opts: LiftOptions, flat_args): +def canon_task_return(thread, result_type, opts: LiftOptions, flat_args): + task = thread.task trap_if(not task.inst.may_leave) trap_if(task.opts.sync) trap_if(result_type != task.ft.result) @@ -3638,9 +3726,10 @@ validation specifies: Calling `$f` cancels the [current task], confirming a previous `subtask.cancel` request made by a supertask and claiming that all `borrow` handles lent to the -current task have already been dropped (and trapping in 
`Task.cancel` if not). +[current task] have already been dropped (and trapping in `Task.cancel` if not). ```python -def canon_task_cancel(task): +def canon_task_cancel(thread): + task = thread.task trap_if(not task.inst.may_leave) trap_if(task.opts.sync) task.cancel() @@ -3655,41 +3744,6 @@ case the callee expects to receive a return value) or if the task has already returned a value or already called `task.cancel`. -### ๐Ÿ”€ `canon yield` - -For a canonical definition: -```wat -(canon yield $cancellable? (core func $f)) -``` -validation specifies: -* `$f` is given type `(func (result i32))` -* ๐ŸšŸ - `cancellable` is allowed (otherwise it must be absent) - -Calling `$f` invokes the following function which yields execution so that -others tasks can execute. This allows a long-running computation that is not -otherwise performing I/O to avoid starving other tasks in a cooperative -setting. -```python -def canon_yield(cancellable, task): - trap_if(not task.inst.may_leave) - event_code,_,_ = task.yield_until(lambda:True, cancellable) - match event_code: - case EventCode.NONE: - return [0] - case EventCode.TASK_CANCELLED: - return [1] -``` -Even though `yield_until` passes `lambda:True` as the condition it is waiting -for, `yield_until` does transitively peform a `Thread.suspend` which allows -the embedder to nondeterministically switch to executing another task. - -If `cancellable` is set, then `yield` will return whether the supertask has -already or concurrently requested cancellation. `yield` (and other cancellable -operations) will only indicate cancellation once and thus, if a caller is not -prepared to propagate cancellation, they can omit `cancellable` so that -cancellation is instead delivered at a later `cancellable` call. 
- - ### ๐Ÿ”€ `canon waitable-set.new` For a canonical definition: @@ -3702,9 +3756,9 @@ validation specifies: Calling `$f` invokes the following function, which adds an empty waitable set to the current component instance's table: ```python -def canon_waitable_set_new(task): - trap_if(not task.inst.may_leave) - return [ task.inst.table.add(WaitableSet()) ] +def canon_waitable_set_new(thread): + trap_if(not thread.task.inst.may_leave) + return [ thread.task.inst.table.add(WaitableSet()) ] ``` @@ -3722,21 +3776,21 @@ Calling `$f` invokes the following function which waits for progress to be made on a `Waitable` in the given waitable set (indicated by index `$si`) and then returning its `EventCode` and writing the payload values into linear memory: ```python -def canon_waitable_set_wait(cancellable, mem, task, si, ptr): - trap_if(not task.inst.may_leave) - wset = task.inst.table.get(si) +def canon_waitable_set_wait(cancellable, mem, thread, si, ptr): + trap_if(not thread.task.inst.may_leave) + wset = thread.task.inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.wait_until(lambda:True, wset, cancellable) - return unpack_event(mem, task, ptr, event) + event = thread.task.wait_until(lambda: True, thread, wset, cancellable) + return unpack_event(mem, thread, ptr, event) -def unpack_event(mem, task, ptr, e: EventTuple): +def unpack_event(mem, thread, ptr, e: EventTuple): event, p1, p2 = e - cx = LiftLowerContext(LiftLowerOptions(memory = mem), task.inst) + cx = LiftLowerContext(LiftLowerOptions(memory = mem), thread.task.inst) store(cx, p1, U32Type(), ptr) store(cx, p2, U32Type(), ptr + 4) return [event] ``` -The `lambda:True` passed to `wait_until` means that `wait_until` will only +The `lambda: True` passed to `wait_until` means that `wait_until` will only wait for the given `wset` to have a pending event with no extra conditions. 
If `cancellable` is set, then `waitable-set.wait` will return whether the @@ -3761,17 +3815,17 @@ Calling `$f` invokes the following function, which returns `NONE` (`0`) instead of blocking if there is no event available, and otherwise returns the event the same way as `wait`. ```python -def canon_waitable_set_poll(cancellable, mem, task, si, ptr): - trap_if(not task.inst.may_leave) - wset = task.inst.table.get(si) +def canon_waitable_set_poll(cancellable, mem, thread, si, ptr): + trap_if(not thread.task.inst.may_leave) + wset = thread.task.inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.poll_until(lambda:True, wset, cancellable) - return unpack_event(mem, task, ptr, event) + event = thread.task.poll_until(lambda: True, thread, wset, cancellable) + return unpack_event(mem, thread, ptr, event) ``` Even though `waitable-set.poll` doesn't block until the given waitable set has a pending event, `poll_until` does transitively perform a `Thread.suspend` which allows the embedder to nondeterministically switch to executing another -task (like `yield`). +task (like `thread.yield`). If `cancellable` is set, then `waitable-set.poll` will return whether the supertask has already or concurrently requested cancellation. 
@@ -3794,9 +3848,9 @@ Calling `$f` invokes the following function, which removes the indicated waitable set from the current component instance's table, performing the guards defined by `WaitableSet.drop` above: ```python -def canon_waitable_set_drop(task, i): - trap_if(not task.inst.may_leave) - wset = task.inst.table.remove(i) +def canon_waitable_set_drop(thread, i): + trap_if(not thread.task.inst.may_leave) + wset = thread.task.inst.table.remove(i) trap_if(not isinstance(wset, WaitableSet)) wset.drop() return [] @@ -3819,14 +3873,14 @@ Calling `$f` invokes the following function which adds the Waitable indicated by the index `wi` to the waitable set indicated by the index `si`, removing the waitable from any waitable set that it is currently a member of. ```python -def canon_waitable_join(task, wi, si): - trap_if(not task.inst.may_leave) - w = task.inst.table.get(wi) +def canon_waitable_join(thread, wi, si): + trap_if(not thread.task.inst.may_leave) + w = thread.task.inst.table.get(wi) trap_if(not isinstance(w, Waitable)) if si == 0: w.join(None) else: - wset = task.inst.table.get(si) + wset = thread.task.inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) w.join(wset) return [] @@ -3847,12 +3901,12 @@ validation specifies: * `$f` is given type `(func (param i32))` * ๐Ÿš - `async` is allowed (otherwise it must be absent) -Calling `$f` sends a request to the subtask at the given index to cancel its -execution ASAP. This request is cooperative and the subtask may take arbitrarily -long to receive and confirm the request. If the subtask doesn't immediately -confirm the cancellation request, `subtask.cancel` returns `BLOCKED` and the -caller must wait for a `SUBTASK` progress update using `waitable-set` methods -as usual. +Calling `$f` sends a request to a nondeterministically-chosen thread of the +subtask at the given index to cancel the subtask ASAP. 
This request is +cooperative and the subtask may take arbitrarily long to receive and confirm +the request. If the subtask doesn't immediately confirm the cancellation +request, `subtask.cancel` returns `BLOCKED` and the caller must wait for a +`SUBTASK` progress update using `waitable-set` methods as usual. When cancellation is confirmed the supertask will receive the final state of the subtask which is one of: @@ -3868,9 +3922,9 @@ the event payload of a future `SUBTASK` event. ```python BLOCKED = 0xffff_ffff -def canon_subtask_cancel(sync, task, i): - trap_if(not task.inst.may_leave) - subtask = task.inst.table.get(i) +def canon_subtask_cancel(sync, thread, i): + trap_if(not thread.task.inst.may_leave) + subtask = thread.task.inst.table.get(i) trap_if(not isinstance(subtask, Subtask)) trap_if(subtask.resolve_delivered()) trap_if(subtask.cancellation_requested) @@ -3881,7 +3935,7 @@ def canon_subtask_cancel(sync, task, i): subtask.callee.request_cancellation() if not subtask.resolved(): if sync: - task.thread.suspend_until(subtask.resolved) + thread.suspend_until(subtask.resolved) else: return [BLOCKED] code,index,payload = subtask.get_pending_event() @@ -3914,9 +3968,9 @@ Calling `$f` removes the subtask at the given index from the current component instance's table, performing the guards and bookkeeping defined by `Subtask.drop()`. ```python -def canon_subtask_drop(task, i): - trap_if(not task.inst.may_leave) - s = task.inst.table.remove(i) +def canon_subtask_drop(thread, i): + trap_if(not thread.task.inst.may_leave) + s = thread.task.inst.table.remove(i) trap_if(not isinstance(s, Subtask)) s.drop() return [] @@ -3942,18 +3996,18 @@ writable end. The expectation is that, after calling `{stream,future}.new`, the readable end is subsequently transferred to another component (or the host) via `stream` or `future` parameter/result type (see `lift_{stream,future}` above). 
```python -def canon_stream_new(stream_t, task): - trap_if(not task.inst.may_leave) +def canon_stream_new(stream_t, thread): + trap_if(not thread.task.inst.may_leave) shared = SharedStreamImpl(stream_t.t) - ri = task.inst.table.add(ReadableStreamEnd(shared)) - wi = task.inst.table.add(WritableStreamEnd(shared)) + ri = thread.task.inst.table.add(ReadableStreamEnd(shared)) + wi = thread.task.inst.table.add(WritableStreamEnd(shared)) return [ ri | (wi << 32) ] -def canon_future_new(future_t, task): - trap_if(not task.inst.may_leave) +def canon_future_new(future_t, thread): + trap_if(not thread.task.inst.may_leave) shared = SharedFutureImpl(future_t.t) - ri = task.inst.table.add(ReadableFutureEnd(shared)) - wi = task.inst.table.add(WritableFutureEnd(shared)) + ri = thread.task.inst.table.add(ReadableFutureEnd(shared)) + wi = thread.task.inst.table.add(WritableFutureEnd(shared)) return [ ri | (wi << 32) ] ``` @@ -3977,13 +4031,13 @@ specifies: The implementation of these built-ins funnels down to a single `stream_copy` function that is parameterized by the direction of the copy: ```python -def canon_stream_read(stream_t, opts, task, i, ptr, n): +def canon_stream_read(stream_t, opts, thread, i, ptr, n): return stream_copy(ReadableStreamEnd, WritableBufferGuestImpl, EventCode.STREAM_READ, - stream_t, opts, task, i, ptr, n) + stream_t, opts, thread, i, ptr, n) -def canon_stream_write(stream_t, opts, task, i, ptr, n): +def canon_stream_write(stream_t, opts, thread, i, ptr, n): return stream_copy(WritableStreamEnd, ReadableBufferGuestImpl, EventCode.STREAM_WRITE, - stream_t, opts, task, i, ptr, n) + stream_t, opts, thread, i, ptr, n) ``` Introducing the `stream_copy` function in chunks, `stream_copy` first checks @@ -3991,13 +4045,14 @@ that the element at index `i` is of the right type and allowed to start a new copy. (In the future, the "trap if not `IDLE`" condition could be relaxed to allow multiple pipelined reads or writes.) 
```python -def stream_copy(EndT, BufferT, event_code, stream_t, opts, task, i, ptr, n): - trap_if(not task.inst.may_leave) - e = task.inst.table.get(i) +def stream_copy(EndT, BufferT, event_code, stream_t, opts, thread, i, ptr, n): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.get(i) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != stream_t.t) trap_if(e.state != CopyState.IDLE) ``` + Then a readable or writable buffer is created which (in `Buffer`'s constructor) eagerly checks the alignment and bounds of (`i`, `n`). (In the future, the restriction on futures/streams containing `borrow`s could be relaxed by @@ -4005,9 +4060,10 @@ maintaining sufficient bookkeeping state to ensure that borrowed handles *or streams/futures of borrowed handles* could not outlive their originating call.) ```python assert(not contains_borrow(stream_t)) - cx = LiftLowerContext(opts, task.inst, borrow_scope = None) + cx = LiftLowerContext(opts, thread.task.inst, borrow_scope = None) buffer = BufferT(stream_t.t, cx, ptr, n) ``` + Next, the `copy` method of `{Readable,Writable}{Stream,Future}End` is called to perform the actual read/write. The `on_copy*` callbacks passed to `copy` bind and store a `stream_event` closure on the readable/writable end (via the @@ -4024,7 +4080,6 @@ until this point into a single `i32` payload for core wasm. ```python def stream_event(result, reclaim_buffer): reclaim_buffer() - assert(e.state == CopyState.COPYING) if result == CopyResult.DROPPED: e.state = CopyState.DONE else: @@ -4040,8 +4095,7 @@ until this point into a single `i32` payload for core wasm. 
def on_copy_done(result): e.set_pending_event(partial(stream_event, result, reclaim_buffer = lambda:())) - e.state = CopyState.COPYING - e.copy(task.inst, buffer, on_copy, on_copy_done) + e.copy(thread.task.inst, buffer, on_copy, on_copy_done) ``` When this `copy` makes progress, a `stream_event` is set on the stream end @@ -4052,8 +4106,10 @@ synchronously and return `BLOCKED` if not: ```python if not e.has_pending_event(): if opts.sync: - task.thread.suspend_until(e.has_pending_event) + e.state = CopyState.SYNC_COPYING + thread.suspend_until(e.has_pending_event) else: + e.state = CopyState.ASYNC_COPYING return [BLOCKED] code,index,payload = e.get_pending_event() assert(code == event_code and index == i and payload != BLOCKED) @@ -4080,30 +4136,31 @@ specifies: The implementation of these built-ins funnels down to a single `future_copy` function that is parameterized by the direction of the copy: ```python -def canon_future_read(future_t, opts, task, i, ptr): +def canon_future_read(future_t, opts, thread, i, ptr): return future_copy(ReadableFutureEnd, WritableBufferGuestImpl, EventCode.FUTURE_READ, - future_t, opts, task, i, ptr) + future_t, opts, thread, i, ptr) -def canon_future_write(future_t, opts, task, i, ptr): +def canon_future_write(future_t, opts, thread, i, ptr): return future_copy(WritableFutureEnd, ReadableBufferGuestImpl, EventCode.FUTURE_WRITE, - future_t, opts, task, i, ptr) + future_t, opts, thread, i, ptr) ``` Introducing the `future_copy` function in chunks, `future_copy` starts with the same set of guards as `stream_copy` for parameters `i` and `ptr`. The only difference is that, with futures, the `Buffer` length is fixed to `1`. 
```python -def future_copy(EndT, BufferT, event_code, future_t, opts, task, i, ptr): - trap_if(not task.inst.may_leave) - e = task.inst.table.get(i) +def future_copy(EndT, BufferT, event_code, future_t, opts, thread, i, ptr): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.get(i) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != future_t.t) trap_if(e.state != CopyState.IDLE) assert(not contains_borrow(future_t)) - cx = LiftLowerContext(opts, task.inst, borrow_scope = None) + cx = LiftLowerContext(opts, thread.task.inst, borrow_scope = None) buffer = BufferT(future_t.t, cx, ptr, 1) ``` + Next, the `copy` method of `{Readable,Writable}FutureEnd.copy` is called to perform the actual read/write. Other than the simplifications allowed by the absence of repeated partial copies, the main difference in the following code @@ -4117,7 +4174,6 @@ of elements copied is not packed in the high 28 bits; they're always zero. ```python def future_event(result): assert((buffer.remain() == 0) == (result == CopyResult.COMPLETED)) - assert(e.state == CopyState.COPYING) if result == CopyResult.DROPPED or result == CopyResult.COMPLETED: e.state = CopyState.DONE else: @@ -4128,8 +4184,7 @@ of elements copied is not packed in the high 28 bits; they're always zero. assert(result != CopyResult.DROPPED or event_code == EventCode.FUTURE_WRITE) e.set_pending_event(partial(future_event, result)) - e.state = CopyState.COPYING - e.copy(task.inst, buffer, on_copy_done) + e.copy(thread.task.inst, buffer, on_copy_done) ``` The end of `future_copy` is the exact same as `stream_copy`: waiting if `sync` @@ -4137,8 +4192,10 @@ and returning either the progress made or `BLOCKED`. 
```python if not e.has_pending_event(): if opts.sync: - task.thread.suspend_until(e.has_pending_event) + e.state = CopyState.SYNC_COPYING + thread.suspend_until(e.has_pending_event) else: + e.state = CopyState.ASYNC_COPYING return [BLOCKED] code,index,payload = e.get_pending_event() assert(code == event_code and index == i) @@ -4163,35 +4220,38 @@ validation specifies: The implementation of these four built-ins all funnel down to a single parameterized `cancel_copy` function: ```python -def canon_stream_cancel_read(stream_t, sync, task, i): - return cancel_copy(ReadableStreamEnd, EventCode.STREAM_READ, stream_t, sync, task, i) +def canon_stream_cancel_read(stream_t, sync, thread, i): + return cancel_copy(ReadableStreamEnd, EventCode.STREAM_READ, stream_t, sync, thread, i) -def canon_stream_cancel_write(stream_t, sync, task, i): - return cancel_copy(WritableStreamEnd, EventCode.STREAM_WRITE, stream_t, sync, task, i) +def canon_stream_cancel_write(stream_t, sync, thread, i): + return cancel_copy(WritableStreamEnd, EventCode.STREAM_WRITE, stream_t, sync, thread, i) -def canon_future_cancel_read(future_t, sync, task, i): - return cancel_copy(ReadableFutureEnd, EventCode.FUTURE_READ, future_t, sync, task, i) +def canon_future_cancel_read(future_t, sync, thread, i): + return cancel_copy(ReadableFutureEnd, EventCode.FUTURE_READ, future_t, sync, thread, i) -def canon_future_cancel_write(future_t, sync, task, i): - return cancel_copy(WritableFutureEnd, EventCode.FUTURE_WRITE, future_t, sync, task, i) +def canon_future_cancel_write(future_t, sync, thread, i): + return cancel_copy(WritableFutureEnd, EventCode.FUTURE_WRITE, future_t, sync, thread, i) -def cancel_copy(EndT, event_code, stream_or_future_t, sync, task, i): - trap_if(not task.inst.may_leave) - e = task.inst.table.get(i) +def cancel_copy(EndT, event_code, stream_or_future_t, sync, thread, i): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.get(i) trap_if(not isinstance(e, EndT)) 
trap_if(e.shared.t != stream_or_future_t.t) - trap_if(e.state != CopyState.COPYING) + trap_if(e.state != CopyState.ASYNC_COPYING) if not e.has_pending_event(): e.shared.cancel() if not e.has_pending_event(): if sync: - task.thread.suspend_until(e.has_pending_event) + thread.suspend_until(e.has_pending_event) else: return [BLOCKED] code,index,payload = e.get_pending_event() - assert(e.state != CopyState.COPYING and code == event_code and index == i) + assert(not e.copying() and code == event_code and index == i) return [payload] ``` +Cancellation traps if there is not currently an async copy in progress (sync +copies do not expect or check for cancellation and thus cannot be cancelled). + The *first* check for `e.has_pending_event()` catches the case where the copy has already racily finished, in which case we must *not* call `cancel()`. Calling `cancel()` may, but is not required to, recursively call one of the `on_*` @@ -4232,21 +4292,21 @@ the given index from the current component instance's table, performing the guards and bookkeeping defined by `{Readable,Writable}{Stream,Future}End.drop()` above. 
```python -def canon_stream_drop_readable(stream_t, task, i): - return drop(ReadableStreamEnd, stream_t, task, i) +def canon_stream_drop_readable(stream_t, thread, i): + return drop(ReadableStreamEnd, stream_t, thread, i) -def canon_stream_drop_writable(stream_t, task, hi): - return drop(WritableStreamEnd, stream_t, task, hi) +def canon_stream_drop_writable(stream_t, thread, hi): + return drop(WritableStreamEnd, stream_t, thread, hi) -def canon_future_drop_readable(future_t, task, i): - return drop(ReadableFutureEnd, future_t, task, i) +def canon_future_drop_readable(future_t, thread, i): + return drop(ReadableFutureEnd, future_t, thread, i) -def canon_future_drop_writable(future_t, task, hi): - return drop(WritableFutureEnd, future_t, task, hi) +def canon_future_drop_writable(future_t, thread, hi): + return drop(WritableFutureEnd, future_t, thread, hi) -def drop(EndT, stream_or_future_t, task, hi): - trap_if(not task.inst.may_leave) - e = task.inst.table.remove(hi) +def drop(EndT, stream_or_future_t, thread, hi): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.remove(hi) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != stream_or_future_t.t) e.drop() @@ -4254,6 +4314,224 @@ def drop(EndT, stream_or_future_t, task, hi): ``` +### ๐Ÿงต `canon thread.index` + +For a canonical definition: +```wat +(canon thread.index (core func $index)) +``` +validation specifies: +* `$index` is given type `(func (result i32))` + +Calling `$index` invokes the following function, which extracts the index +of the [current thread]: +```python +def canon_thread_index(thread): + assert(thread.index is not None) + return [thread.index] +``` + + +### ๐Ÿงต `canon thread.new_indirect` + +For a canonical definition: +```wat +(canon thread.new_indirect $ft $ftbl (core func $new_indirect)) +``` +validation specifies +* `$ft` must refer to the type `(func (param $c i32))` +* `$ftbl` must refer to a table whose element type matches `funcref` +* `$new_indirect` is given 
type `(func (param $fi i32) (param $c i32) (result i32))` + +Calling `$new_indirect` invokes the following function which reads a `funcref` +from `$ftbl` (trapping if out-of-bounds, null or the wrong type), calls the +`funcref` passing the closure parameter `$c`, and returns the index of the new +thread in the current component instance's table. +```python +@dataclass +class CoreFuncRef: + t: CoreFuncType + callee: Callable[[Thread, list[CoreValType]], list[CoreValType]] + +def canon_thread_new_indirect(ft, ftbl: Table[CoreFuncRef], thread, fi, c): + trap_if(not thread.task.inst.may_leave) + f = ftbl.get(fi) + assert(ft == CoreFuncType(['i32'], [])) + trap_if(f.t != ft) + def thread_func(thread): + [] = call_and_trap_on_throw(f.callee, thread, [c]) + new_thread = Thread(thread.task, thread_func) + assert(new_thread.suspended()) + new_thread.index = thread.task.inst.table.add(new_thread) + return [new_thread.index] +``` +The newly-created thread starts out in a "suspended" state and so, to +actually start executing, Core WebAssembly code must call one of the other +`thread.*` built-ins defined below. + + +### ๐Ÿงต `canon thread.switch-to` + +For a canonical definition: +```wat +(canon thread.switch-to $cancellable? (core func $switch-to)) +``` +validation specifies: +* `$switch-to` is given type `(func (param $i i32) (result i32))` + +Calling `$switch-to` invokes the following function which loads a thread at +index `$i` from the current component instance's table, traps if it's not +[suspended], and then switches to that thread, leaving the [current thread] +suspended. 
+```python +class SuspendResult(IntEnum): + COMPLETED = 0 + CANCELLED = 1 + +def canon_thread_switch_to(cancellable, thread, i): + trap_if(not thread.task.inst.may_leave) + other_thread = thread.task.inst.table.get(i) + trap_if(not isinstance(other_thread, Thread)) + trap_if(not other_thread.suspended()) + if not thread.task.switch_to(thread, cancellable, other_thread): + assert(cancellable) + return [SuspendResult.CANCELLED] + else: + return [SuspendResult.COMPLETED] +``` +If `cancellable` is set, then `thread.switch-to` will return whether the +supertask has already or concurrently requested cancellation. `thread.switch-to` +(and other cancellable operations) will only indicate cancellation once and +thus, if a caller is not prepared to propagate cancellation, they can omit +`cancellable` so that cancellation is instead delivered at a later +`cancellable` call. + + +### ๐Ÿงต `canon thread.suspend` + +For a canonical definition: +```wat +(canon thread.suspend $cancellable? (core func $suspend)) +``` +validation specifies: +* `$suspend` is given type `(func (result i32))` + +Calling `$suspend` invokes the following function which suspends the [current +thread], immediately returning control flow to any transitive `async`-lowered +calling component. +```python +def canon_thread_suspend(cancellable, thread): + trap_if(not thread.task.inst.may_leave) + if not thread.task.suspend(thread, cancellable): + assert(cancellable) + return [SuspendResult.CANCELLED] + else: + return [SuspendResult.COMPLETED] +``` +If `cancellable` is set, then `thread.suspend` will return whether the +supertask has already or concurrently requested cancellation. `thread.suspend` +(and other cancellable operations) will only indicate cancellation once and +thus, if a caller is not prepared to propagate cancellation, they can omit +`cancellable` so that cancellation is instead delivered at a later +`cancellable` call. 
+ + +### ๐Ÿงต `canon thread.resume-later` + +For a canonical definition: +```wat +(canon thread.resume-later (core func $resume-later)) +``` +validation specifies: +* `$resume-later` is given type `(func (param $i i32))` + +Calling `$resume-later` invokes the following function which loads a thread at +index `$i` from the current component instance's table, traps if it's not +[suspended], and then marks that thread as ready to run at some +nondeterministic point in the future chosen by the embedder. +```python +def canon_thread_resume_later(thread, i): + trap_if(not thread.task.inst.may_leave) + other_thread = thread.task.inst.table.get(i) + trap_if(not isinstance(other_thread, Thread)) + trap_if(not other_thread.suspended()) + other_thread.resume_later() + return [] +``` +`thread.resume-later` never suspends the [current thread] and so there is no +possibility of cancellation and thus no `cancellable` immediate. + + +### ๐Ÿงต `canon thread.yield-to` + +For a canonical definition: +```wat +(canon thread.yield-to $cancellable? (core func $yield-to)) +``` +validation specifies: +* `$yield-to` is given type `(func (param $i i32) (result i32))` +* ๐ŸšŸ - `cancellable` is allowed (otherwise it must be absent) + +Calling `$yield-to` invokes the following function which loads a thread at +index `$i` from the current component instance's table, traps if it's not +[suspended], and then switches to that thread, leaving the [current thread] +ready to run at some nondeterministic point in the future chosen by the +embedder. 
+```python
+def canon_thread_yield_to(cancellable, thread, i):
+  trap_if(not thread.task.inst.may_leave)
+  other_thread = thread.task.inst.table.get(i)
+  trap_if(not isinstance(other_thread, Thread))
+  trap_if(not other_thread.suspended())
+  if not thread.task.yield_to(thread, cancellable, other_thread):
+    assert(cancellable)
+    return [SuspendResult.CANCELLED]
+  else:
+    return [SuspendResult.COMPLETED]
+```
+If `cancellable` is set, then `thread.yield-to` will return whether the
+supertask has already or concurrently requested cancellation. `thread.yield-to`
+(and other cancellable operations) will only indicate cancellation once and
+thus, if a caller is not prepared to propagate cancellation, they can omit
+`cancellable` so that cancellation is instead delivered at a later
+`cancellable` call.
+
+
+### ๐Ÿงต `canon thread.yield`
+
+For a canonical definition:
+```wat
+(canon thread.yield $cancellable? (core func $yield))
+```
+validation specifies:
+* `$yield` is given type `(func (result i32))`
+
+Calling `$yield` invokes the following function which yields execution so that
+other threads can execute, leaving the current thread ready to run at some
+nondeterministic point in the future chosen by the embedder. This allows a
+long-running computation that is not otherwise performing I/O to avoid starving
+other threads in a cooperative setting.
+```python
+def canon_thread_yield(cancellable, thread):
+  trap_if(not thread.task.inst.may_leave)
+  event_code,_,_ = thread.task.yield_until(lambda: True, thread, cancellable)
+  match event_code:
+    case EventCode.NONE:
+      return [SuspendResult.COMPLETED]
+    case EventCode.TASK_CANCELLED:
+      return [SuspendResult.CANCELLED]
+```
+Even though `yield_until` passes `lambda: True` as the condition it is waiting
+for, `yield_until` does transitively perform a `Thread.suspend` which allows
+the embedder to nondeterministically switch to executing another thread.
+ +If `cancellable` is set, then `thread.yield` will return whether the supertask +has already or concurrently requested cancellation. `thread.yield` (and other +cancellable operations) will only indicate cancellation once and thus, if a +caller is not prepared to propagate cancellation, they can omit `cancellable` +so that cancellation is instead delivered at a later `cancellable` call. + + ### ๐Ÿ“ `canon error-context.new` For a canonical definition: @@ -4274,15 +4552,15 @@ index. class ErrorContext: debug_message: String -def canon_error_context_new(opts, task, ptr, tagged_code_units): - trap_if(not task.inst.may_leave) +def canon_error_context_new(opts, thread, ptr, tagged_code_units): + trap_if(not thread.task.inst.may_leave) if DETERMINISTIC_PROFILE or random.randint(0,1): s = String(('', 'utf8', 0)) else: - cx = LiftLowerContext(opts, task.inst) + cx = LiftLowerContext(opts, thread.task.inst) s = load_string_from_range(cx, ptr, tagged_code_units) s = host_defined_transformation(s) - i = task.inst.table.add(ErrorContext(s)) + i = thread.task.inst.table.add(ErrorContext(s)) return [i] ``` Supporting the requirement (introduced in the @@ -4313,17 +4591,18 @@ value may non-deterministically discard or transform the debug message, a single `error-context` value must return the same debug message from `error.debug-message` over time. ```python -def canon_error_context_debug_message(opts, task, i, ptr): - trap_if(not task.inst.may_leave) - errctx = task.inst.table.get(i) +def canon_error_context_debug_message(opts, thread, i, ptr): + trap_if(not thread.task.inst.may_leave) + errctx = thread.task.inst.table.get(i) trap_if(not isinstance(errctx, ErrorContext)) - cx = LiftLowerContext(opts, task.inst) + cx = LiftLowerContext(opts, thread.task.inst) store_string(cx, errctx.debug_message, ptr) return [] ``` Note that `ptr` points to an 8-byte region of memory into which will be stored the pointer and length of the debug string (allocated via `opts.realloc`). 
+ ### ๐Ÿ“ `canon error-context.drop` For a canonical definition: @@ -4336,25 +4615,29 @@ validation specifies: Calling `$f` calls the following function, which drops the error context value at the given index from the current component instance's table: ```python -def canon_error_context_drop(task, i): - trap_if(not task.inst.may_leave) - errctx = task.inst.table.remove(i) +def canon_error_context_drop(thread, i): + trap_if(not thread.task.inst.may_leave) + errctx = thread.task.inst.table.remove(i) trap_if(not isinstance(errctx, ErrorContext)) return [] ``` -### ๐Ÿงต `canon thread.spawn_ref` +### ๐Ÿงตโ‘ก `canon thread.spawn_ref` For a canonical definition: ```wat -(canon thread.spawn_ref $ft (core func $spawn_ref)) +(canon thread.spawn_ref shared? $ft (core func $spawn_ref)) ``` validation specifies: -* `$ft` must refer to a `shared` function type; initially, only the type - `(shared (func (param $c i32)))` is allowed (see explanation below) -* `$spawn_ref` is given type `(func (param $f (ref null $ft)) (param $c i32) - (result $e i32))`. +* `$ft` must refer to the type `(shared? (func (param $c i32)))` (see explanation below) +* `$spawn_ref` is given type + `(shared? (func (param $f (ref null $ft)) (param $c i32) (result $e i32)))` + +When the `shared` immediate is not present, the spawned thread is +*cooperative*, only switching at specific program points. When the `shared` +immediate is present, the spawned thread is *preemptive* and able to execute in +parallel with all other threads. > Note: ideally, a thread could be spawned with [arbitrary thread parameters]. > Currently, that would require additional work in the toolchain to support so, @@ -4364,86 +4647,66 @@ validation specifies: > The inclusion of `$ft` ensures backwards compatibility for when arbitrary > parameters are allowed. -Calling `$spawn_ref` checks that the reference `$f` is not null. 
Then, it spawns -a thread which: - - invokes `$f` with `$c` - - executes `$f` until completion or trap in a `shared` context as described by - the [shared-everything threads] proposal. - -In pseudocode, `$spawn_ref` looks like: - +Calling `$spawn_ref` invokes the following function which simply fuses the +`thread.new_ref` and `thread.resume-later` built-ins, allowing +thread-creation to skip the intermediate "suspended" state transition. ```python -def canon_thread_spawn_ref(f, c): - trap_if(f is None) +def canon_thread_spawn_ref(shared, ft, thread, f, c): + trap_if(not thread.task.inst.may_leave) if DETERMINISTIC_PROFILE: - return [-1] - - def thread_start(): - try: - f(c) - except CoreWebAssemblyException: - trap() - - if spawn(thread_start): return [0] - else: - return [-1] + [new_thread_index] = canon_thread_new_ref(shared, ft, thread, f, c) + [] = canon_thread_resume_later(shared, thread, new_thread_index) + return [new_thread_index] ``` +Note: `canon_thread_new_ref` has not yet been defined, but will be added as +part of adding a [GC ABI option] to the Canonical ABI and would work +like `canon_thread_new_indirect` minus the table access and type check. -### ๐Ÿงต `canon thread.spawn_indirect` +### ๐Ÿงตโ‘ก `canon thread.spawn_indirect` For a canonical definition: ```wat -(canon thread.spawn_indirect $ft $tbl (core func $spawn_indirect)) +(canon thread.spawn_indirect shared? $ft $tbl (core func $spawn_indirect)) ``` validation specifies: -* `$ft` must refer to a `shared` function type; initially, only the type - `(shared (func (param $c i32)))` is allowed (see explanation in - `thread.spawn_ref` above) -* `$tbl` must refer to a shared table whose element type matches `(ref null - (shared func))` -* `$spawn_indirect` is given type `(func (param $i i32) (param $c i32) (result - $e i32))`. - -Calling `$spawn_indirect` retrieves a reference to function `$f` from table -`$tbl` and checks that `$f` is of type `$ft`. 
If that succeeds, it spawns a
-thread which:
-  - invokes `$f` with `$c`
-  - executes `$f` until completion or trap in a `shared` context as described by
-    the [shared-everything threads] proposal.
-
-In pseudocode, `$spawn_indirect` looks like:
-
-```python
-def canon_thread_spawn_indirect(ft, tbl, i, c):
-  f = tbl[i]
-  trap_if(f is None)
-  trap_if(f.type != ft)
+* `$ft` must refer to the type `(shared? (func (param $c i32)))`
+  (see explanation in `thread.spawn_ref` above)
+* `$tbl` must refer to a shared table whose element type matches
+  `(ref null (shared? func))`
+* `$spawn_indirect` is given type
+  `(shared? (func (param $i i32) (param $c i32) (result $e i32)))`
+
+When the `shared` immediate is not present, the spawned thread is
+*cooperative*, only switching at specific program points. When the `shared`
+immediate is present, the spawned thread is *preemptive* and able to execute in
+parallel with all other threads.
+
+Calling `$spawn_indirect` invokes the following function which simply fuses
+the `thread.new_indirect` and `thread.resume-later` built-ins, allowing
+thread-creation to skip the intermediate "suspended" state transition.
+```python
+def canon_thread_spawn_indirect(shared, ft, ftbl: Table[CoreFuncRef], thread, fi, c):
+  trap_if(not thread.task.inst.may_leave)
   if DETERMINISTIC_PROFILE:
-    return [-1]
-
-  def thread_start():
-    try:
-      f(c)
-    except CoreWebAssemblyException:
-      trap()
-
-  if spawn(thread_start):
     return [0]
-  else:
-    return [-1]
+  [new_thread_index] = canon_thread_new_indirect(shared, ft, ftbl, thread, fi, c)
+  [] = canon_thread_resume_later(shared, thread, new_thread_index)
+  return [new_thread_index]
 ```
+Note: `canon_thread_new_indirect` has not yet been extended to take a
+`shared` parameter, but will be as [shared-everything-threads] progresses.
 
-### ๐Ÿงต `canon thread.available_parallelism`
+### ๐Ÿงตโ‘ก `canon thread.available-parallelism`
 
 For a canonical definition:
 ```wat
-(canon thread.available_parallelism (core func $f))
+(canon thread.available-parallelism shared? (core func $f))
 ```
 validation specifies:
-* `$f` is given type `(func shared (result i32))`.
+* `$f` is given type `(func shared? (result i32))`.
 
 Calling `$f` returns the number of threads the underlying hardware can be
 expected to execute in parallel. This value can be artificially limited by
@@ -4469,16 +4732,21 @@ def canon_thread_available_parallelism():
 [JavaScript Embedding]: Explainer.md#JavaScript-embedding
 [Adapter Functions]: FutureFeatures.md#custom-abis-via-adapter-functions
 [Shared-Everything Dynamic Linking]: examples/SharedEverythingDynamicLinking.md
-[Structured Concurrency]: Async.md#structured-concurrency
-[Backpressure]: Async.md#backpressure
-[Current Task]: Async.md#current-task
-[Subtasks]: Async.md#structured-concurrency
-[Readable and Writable Ends]: Async.md#streams-and-futures
-[Readable or Writable End]: Async.md#streams-and-futures
-[Context-Local Storage]: Async.md#context-local-storage
-[Subtask State Machine]: Async.md#cancellation
-[Stream Readiness]: Async.md#stream-readiness
+[Concurrency Explainer]: Concurrency.md
+[Suspended]: Concurrency.md#waiting
+[Structured Concurrency]: Concurrency.md#structured-concurrency
+[Backpressure]: Concurrency.md#backpressure
+[Current Thread]: Concurrency.md#current-thread-and-task
+[Current Task]: Concurrency.md#current-thread-and-task
+[Subtasks]: Concurrency.md#structured-concurrency
+[Readable and Writable Ends]: Concurrency.md#streams-and-futures
+[Readable or Writable End]: Concurrency.md#streams-and-futures
+[Thread-Local Storage]: Concurrency.md#thread-local-storage
+[Subtask State Machine]: Concurrency.md#cancellation
+[Stream Readiness]: Concurrency.md#stream-readiness
+
 [Lazy Lowering]: https://github.com/WebAssembly/component-model/issues/383
+[GC ABI Option]: 
https://github.com/WebAssembly/component-model/issues/525 [Core WebAssembly Embedding]: https://webassembly.github.io/spec/core/appendix/embedding.html [`store_init`]: https://webassembly.github.io/spec/core/appendix/embedding.html#store @@ -4516,9 +4784,8 @@ def canon_thread_available_parallelism(): [`import_name`]: https://clang.llvm.org/docs/AttributeReference.html#import-name [`export_name`]: https://clang.llvm.org/docs/AttributeReference.html#export-name +[shared-everything-threads]: https://github.com/WebAssembly/shared-everything-threads [Arbitrary Thread Parameters]: https://github.com/WebAssembly/shared-everything-threads/discussions/3 -[wasi-libc Convention]: https://github.com/WebAssembly/wasi-libc/blob/925ad6d7/libc-top-half/musl/src/thread/pthread_create.c#L318 -[Shared-Everything Threads]: https://github.com/WebAssembly/shared-everything-threads/blob/main/proposals/shared-everything-threads/Overview.md [`threading`]: https://docs.python.org/3/library/threading.html [`threading.Thread`]: https://docs.python.org/3/library/threading.html#thread-objects diff --git a/design/mvp/Async.md b/design/mvp/Concurrency.md similarity index 68% rename from design/mvp/Async.md rename to design/mvp/Concurrency.md index a3d02b0f..39fa673c 100644 --- a/design/mvp/Async.md +++ b/design/mvp/Concurrency.md @@ -1,21 +1,18 @@ -# ๐Ÿ”€ Async Explainer +# Concurrency Explainer -*This explainer describes a feature that is part of the forthcoming "Preview 3" -release of the Component Model. The relevant parts of the [AST explainer], -[binary format] and [Canonical ABI explainer] are gated by the ๐Ÿ”€ emoji.* - -This explainer provides a high-level summary of the native async support in the -Component Model. For a detailed presentation of the runtime semantics, see the -[Canonical ABI explainer]. See also the [Wasm I/O 2024 presentation] for a -summary of the motivation and animated sketch of the design in action. 
+This document contains a high-level summary of the native concurrency support +added as part of [WASI Preview 3], providing background for understanding the +definitions in the [WIT], [AST explainer], [binary format] and [Canonical ABI +explainer] documents that are gated by the ๐Ÿ”€ (async) and ๐Ÿงต (threading) +emojis. For an even higher-level introduction, see [these][wasmio-2024] +[presentations][wasmio-2025]. * [Goals](#goals) -* [High-level Approach](#high-level-approach) +* [Summary](#summary) * [Concepts](#concepts) - * [Sync and Async Functions](#sync-and-async-functions) - * [Task](#task) - * [Current task](#current-task) - * [Context-Local Storage](#context-local-storage) + * [Threads and Tasks](#threads-and-tasks) + * [Current Thread and Task](#current-thread-and-task) + * [Thread-Local Storage](#thread-local-storage) * [Structured concurrency](#structured-concurrency) * [Streams and Futures](#streams-and-futures) * [Stream Readiness](#stream-readiness) @@ -26,7 +23,6 @@ summary of the motivation and animated sketch of the design in action. * [Cancellation](#cancellation) * [Nondeterminism](#nondeterminism) * [Interaction with the start function](#interaction-with-the-start-function) -* [Interaction with multi-threading](#interaction-with-multi-threading) * [Async ABI](#async-abi) * [Async Import ABI](#async-import-abi) * [Async Export ABI](#async-export-abi) @@ -36,68 +32,117 @@ summary of the motivation and animated sketch of the design in action. ## Goals -Given only *synchronous* functions with values and resources, when a component -needs to do concurrent (i.e., overlapping, interleaved, streaming) I/O, the -resulting [WIT] interfaces and implementations end up being complex, -hard to compose, and less efficient. By extending the Component Model with -built-in **asynchronous** support, these pain points can be addressed. 
- -The Component Model's [goals] and intended [use cases] suggest the following -additional goals and requirements for native async support: - -* Be independent-of but complementary-to the Core WebAssembly [stack-switching] - proposal; don't depend on this proposal being fully standard or implemented - (just like [JSPI]). -* Be independent-of but complementary-to the Core WebAssembly - [shared-everything-threads] proposal; don't depend on this proposal being - fully standard or implemented and ensure that components can achieve a high - degree of concurrency using only one. -* Avoid partitioning interfaces and components into separate strata; don't give - functions (or components) a [color]. -* Enable tight integration (e.g., automatic bindings generation) with a wide - variety of source languages' built-in concurrency features. +Refining the Component Model's high-level [goals](../high-level/Goals.md) and +[use cases](../high-level/UseCases.md), [WASI Preview 3] adds the following +concurrency-specific goals and use cases: + +* Integrate with idiomatic source-language concurrency features including: + * `async` functions in languages like C#, JS, Python, Rust and Swift + * coroutines in languages like Kotlin, Perl, PHP and (recently) C++ + * green threads scheduled by the language's own runtime in languages like Go + and (initially and recently again) Java + * host threads that are scheduled outside the language's own runtime in + languages like C, C++, C#, Python, Rust and many more that expose pthreads + or other OS threads + * promises, futures, streams and channels + * callbacks, in languages with no other built-in concurrency mechanisms +* Provide [fiber]-like stack-switching capabilities via Core WebAssembly + import calls in a way that complements, but doesn't depend on, new Core + WebAssembly proposals including [stack-switching] and + [shared-everything-threads]. 
+* Allow polyfilling in browsers via JavaScript Promise Integration ([JSPI]) +* Avoid partitioning interfaces and components into separate ecosystems based + on degree of concurrency; don't give functions or components a "[color]". * Maintain meaningful cross-language call stacks (for the benefit of debugging, logging and tracing). -* Provide mechanisms for applying and observing backpressure. -* Allow non-reentrant synchronous and event-loop-driven core wasm code (that, - e.g., assumes a single global linear memory stack) to not have to worry about +* Consider backpressure and cancellation as part of the design. +* Allow non-reentrant synchronous and event-loop-driven core wasm code that + assumes a single global linear memory stack to not have to worry about additional reentrancy. -## High-level Approach - -Based on the above goals, the Component Model's approach to native async starts -by allowing components to import and export "async" functions which abstract -over, and can be implemented by, idiomatic concurrency in a variety of -programming languages: -* `async` functions in languages like C#, JS, Python, Rust and Swift -* stackful coroutines in languages like Kotlin, Perl, PHP and (recently) C++ -* green threads as-if running on a single OS thread in languages like Go and - (initially and recently again) Java -* callbacks, in languages with no explicit async support - -The Component Model supports this wide variety of language features by -specifying a common low-level "async" ABI which the different languages' -compilers and runtimes can bind their different language-level concurrency -features to. This is similar to how a native OS exposes APIs for concurrency -(such as `epoll`, `io_uring`, `kqueue` or Overlapped I/O) to which most of -these languages' concurrency features are already bound (making the Component -Model "just another OS" from the language toolchains' perspective). 
- -Moreover, this async ABI does not require components to use preemptive -multi-threading ([`thread.spawn*`]) in order to achieve concurrency. Instead, -concurrency can be achieved by cooperatively switching between different -logical tasks running on a single thread using [fibers] or a [CPS transform] in -the wasm runtime as necessary. - -To avoid partitioning the world along sync/async lines as mentioned in the -Goals section, the Component Model allows *every* component-level function type -to be both implemented and called in either a synchronous or asynchronous -manner. Thus, function types do not dictate synchrony or asynchrony and all 4 -combinations of {sync, async} x {caller, callee} are supported and given a -well-defined behavior. Specifically, the caller and callee can independently -specify `async` as an immediate flags on the [lift and lower definitions] used -to define their imports and exports. +## Summary + +To support the wide variety of language-level concurrency mechanisms listed +above, the Component Model defines a new low-level, language-agnostic async +calling convention (the "async ABI") for both calling into and calling out of +Core WebAssembly. Language compilers and runtimes can bind to this async ABI in +the same way that they already bind to various OS's concurrent I/O APIs (such +as `select`, `epoll`, `io_uring`, `kqueue` and Overlapped I/O) making the +Component Model "just another OS" from the language toolchain's perspective. + +The new async ABI can be used alongside or instead of the existing Preview 2 +"sync ABI" to call or implement *any* WIT function type, not just functions +with specific signatures. This allows *all* function types to be called or +implemented concurrently. When *calling* an imported function via the async +ABI, if the callee blocks, control flow is returned immediately to the caller, +and the callee resumes execution concurrently. 
When *implementing* an exported +function via the async ABI, multiple concurrent export calls are allowed to +be made by the caller. Critically, both sync-to-async and async-to-sync +pairings have well-defined, composable behavior for both inter-component and +intra-component calls, so that functions and components are not forced to pick +a "[color]". + +Although Component Model function *types* are colorless, it can still be +beneficial, especially in languages with `async`/`await`-style concurrency, to +give the bindings generator a *hint* as to whether or not a particular function +declared in WIT should appear as an `async` function in the generated bindings +by default. Even in languages with colorless functions, developers and their +tools can still benefit from such a hint when determining whether they want to +call a particular imported function concurrently or not. To support these use +cases, functions in WIT can be annotated with an `async` hint. E.g.: +```wit +interface http-handler { + use http-types.{request, response, error}; + handle: async func(r: request) -> result; +} +``` +Since `async` is just a hint, this `handle` function can be called using both +the sync and async ABIs. Bindings generators can even generate both variants +side-by-side, giving the developer the choice. + +Each time a component export is called, the wasm runtime logically spawns a new +[green thread] (as opposed to a [kernel thread]) to execute the export call +concurrently with other calls in the runtime. This means that [thread-local +storage](#thread-local-storage) is never reused between export calls and, in +general, a caller's thread's identity is never observable to the callee. In +some cases (such as when only sync ABI components are used) the runtime can +statically, as an optimization, make a plain synchronous function call with the +same wasm-observable behavior as-if it had created a new thread. 
But in +general, when one component makes an async call that transitively blocks in +another component, having the callee on its own native callstack is needed for +the runtime to be able to switch back to the caller without having to unwind +the stack. + +In addition to the *implicit* threads logically created for export calls, Core +WebAssembly code can also *explicitly* create new green threads by calling the +[`thread.new_indirect`] built-in. Regardless of how they were created, all +threads can call a set of Component Model-defined `thread.*` built-in functions +(listed [below](#waiting)) to suspend themselves and/or resume other threads. +These built-ins provide sufficient functionality to implement both the +internally-scheduled "green thread" and the externally-scheduled "host thread" +use cases mentioned in the [goals](#goals). + +Until the Core WebAssembly [shared-everything-threads] proposal allows Core +WebAssembly function types to be annotated with `shared`, `thread.new_indirect` +can only call non-`shared` functions (via `i32` `(table funcref)` index, just +like `call_indirect`) and thus currently all threads must execute +[cooperatively] in a sequentially-interleaved fashion, switching between +threads only at explicit program points just like (and implementable via) a +traditional OS [fiber]. While these **cooperative threads** do not allow a +single component instance to increase its internal parallelism, cooperative +threads are still quite useful for getting existing threaded code to Just Work +(as-if running on a single core) without the overhead of [CPS Transform] +techniques like [Asyncify] and without depending on [shared-everything-threads]. +Moreover, in various embeddings, all available parallelism is already saturated +by running independent component instances on separate kernel threads. 
+ +Because new threads are (semantically, if not physically) created at all +cross-component call boundaries, the degree of `shared` and non-`shared` thread +use is kept an encapsulated implementation detail of a component (similar to +the choice of linear vs. GC memory). This enables component authors to +compatibly change their implementation strategy over time, starting simple and +adding complexity for performance as needed over time. To provide wasm runtimes with additional optimization opportunities for languages with "stackless" concurrency (e.g. languages using `async`/`await`), @@ -165,96 +210,81 @@ soon, GC) memory. ## Concepts -The following concepts are defined as part of the Component Model's native -async support. - -### Sync and Async Functions - -The distinction between sync and async functions does not appear in the -component-level function type (nor in WIT). Rather, an "async" function is a -component-level function that has been [lifted] from Core WebAssembly with the -`async` option set. Symmetrically, a "sync" function is a component-level -function that does not have the `async` option set (which is the default and -only option prior to Preview 3). Thus, the sync/async distinction appears -only independently in how a component-level function is *implemented* or -*called*. This lack of distinction helps to avoid the classic ["What Color Is -Your Function?"][Color] problem. - -Functions *may* however be annotated (in both WIT and component binaries) with -`async` as a *hint*. This hint is intended to inform the default language -bindings generation, indicating whether to use a source-level `async` function -or not in languages that have such a distinction (e.g., JS, Python, C# and -Rust). In the absence of such a hint, a bindings generator would be forced to -make a uniform decision for what to do by default for all functions or require -manual programmer directives. 
However, because `async` is just a hint, there is -no prohibition against non-`async`-exported functions calling imported `async` -functions. This does mean that non-`async` functions may end up blocking -their caller, but (1) any loss in performance is the callee's "fault", (2) the -caller can still lower `async` if they want to (overriding the default hint), -(3) any *transitive* caller an lower `async` to avoid blocking. - -For example, given this interface: -```wit -interface filesystem { - resource file { - constructor(); - is-closed: func() -> bool; - read: async func(num-bytes: u32) -> result>; - from-bytes: static func(bytes: list) -> file; - from-stream: static async func(bytes: stream) -> file; - } -} -``` -A bindings generator processing the above WIT for a language with `async` would -only emit `async` functions for `read` and `from-stream`. - -Since in many languages `new` expressions cannot be async, there is no -`async constructor`. Use cases requiring asynchronous construction can instead -use `static async` functions, similar to `from-stream` in this example. - -### Task - -Every time a lifted function is called (e.g., when a component's export is -called by the outside world), a new **task** is created that logically contains -all the transitive control-flow state of the export call and will be destroyed -when the export call finishes. - -When all of a component's exports are lifted synchronously, there will be at most one -task alive at any one time. However, when a component exports asynchronously-lifted -functions, there can be multiple tasks alive at once. - -In the Canonical ABI explainer, a "task" is represented with the Python -[`Task`] class. A new `Task` object is created (by [`canon_lift`]) each time -a component export is called. +The following concepts are defined as part of the Component Model's concurrency +support. 
-### Current Task +### Threads and Tasks -At any point in time when executing in Core WebAssembly code, there is a -well-defined **current task**. Thus, whenever a [canonical built-in] is called -by Core WebAssembly code, it is meaningful for the built-in to work in terms -"the current task". - -The "current task" is modelled in the Canonical ABI's Python code -by implicitly threading the `Task` object created by [`canon_lift`] through all -the Python functions transitively called by `canon_lift`. Thus, although there -can be multiple live `Task` objects in a component instance, "the current one" -is always clear: it's the one passed to the current function as a parameter. - -### Context-Local Storage - -Each task contains a distinct mutable **context-local storage** array. The -current task's context-local storage can be read and written from core wasm +As described in the [summary](#summary), each call to a component export +logically creates a new ([green][green thread]) **thread** which, in many +cases, can be optimized away and replaced with a synchronous function call. +Each call to a component export also creates a new **task** that *contains* +this new thread. Whereas a *thread* contains a callstack and other execution +state, a *task* contains ABI bookkeeping state that is used to enforce the +Canonical ABI's rules for export calls. Tasks are themselves contained by the +component instance whose export was called. Thus, the overall containment +relationship is: +``` +Component Store + โ†“ contains +Component Instance + โ†“ contains +Task + โ†“ contains +Thread +``` +where a **component store** is the top-level "thing" and analogous to a Core +WebAssembly [store]. + +The reason for the thread/task split is so that, when one thread creates a new +thread by calling [`thread.new_indirect`], the new thread is contained by the +task of the original thread. 
Thus there is an N:1 relationship between threads +and tasks that ties N threads to the original export call (= "task") that +transitively spawned those N threads. This relationship serves several purposes +described in the following sections. + +In the Canonical ABI explainer, threads, tasks, component instances and +component stores are represented by the [`Thread`], [`Task`], +[`ComponentInstance`] and [`Store`] classes, resp. + +### Current Thread and Task + +At any point in time while executing Core WebAssembly code or a [canonical +built-in] called by Core WebAssembly code, there is a well-defined **current +thread** whose containing task is the **current task**. The "current thread" is +modelled in the Canonical ABI's Python code by explicitly passing a [`Thread`] +object as an argument to all function calls so that the semantic "current +thread" is always the value of the `thread` parameter. Threads store their +containing task so that the "current task" is always `thread.task`. + +### Thread-Local Storage + +Each thread contains a distinct mutable **thread-local storage** array. The +current thread's thread-local storage can be read and written from core wasm code by calling the [`context.get`] and [`context.set`] built-ins. -The context-local storage array's length is currently fixed to contain exactly -1 `i32` with the goal of allowing this array to be stored inline in whatever +The thread-local storage array's length is currently fixed to contain exactly +2 `i32`s with the goal of allowing this array to be stored inline in whatever existing runtime data structure is already efficiently reachable from ambient compiled wasm code. Because module instantiation is declarative in the Component Model, the imported `context.{get,set}` built-ins can be inlined by the core wasm compiler as-if they were instructions, allowing the generated -machine code to be a single load or store. 
This makes context-local storage a -good place to store the pointer to the struct used to implement [thread-local -storage] APIs used by guest code. +machine code to be a single load or store. This makes thread-local storage a +natural place to store: +1. a pointer to the linear-memory "shadow stack" pointer +2. a pointer to a struct used by the runtime to implement the language's + thread-local features + +When threads are created explicitly by `thread.new_indirect`, the lifetime of +the thread-local storage array ends when the function passed to +`thread.new_indirect` returns and thus any linear-memory allocations associated +with the thread-local storage array should be eagerly freed by guest code right +before returning. Similarly, since each call to an export logically creates a +fresh thread, thread-local allocations can be eagerly released when this +implicit thread exits by returning from the exported function or, if the +stackless async ABI is used, returning the "exit" code to the event loop. This +non-reuse of thread-local storage between distinct export calls avoids what +would otherwise be a likely source of TLS-related memory leaks. When [memory64] is integrated into the Component Model's Canonical ABI, `context.{get,set}` will be backwards-compatibly relaxed to allow `i64` @@ -262,27 +292,18 @@ pointers (overlaying the `i32` values like hardware 32/64-bit registers). When [wasm-gc] is integrated, these integral context values can serve as indices into guest-managed tables of typed GC references. -When [threads are added](#interaction-with-multi-threading), each thread will -also get its own distinct mutable context-local storage array. This is the -reason why "context-local" storage is not called "task-local" storage (where a -"context" is a finer-grained unit of execution than either a "task" or a -"thread"). 
As part of this, the context-local storage array length will be -increased to 2, allowing the linear-memory stack pointer to be moved from a -core wasm `global` into context-local storage. - -Since the same mutable context-local storage cells are shared by all core wasm -running under the same task/thread in the same component, the cells' contents -must be carefully coordinated in the same way as native code has to carefully +Since the same mutable thread-local storage cells are shared by all core wasm +running under the same thread in the same component, the cells' contents must +be carefully coordinated in the same way as native code has to carefully coordinate native ISA state (e.g., the [FS or GS segment base address]). In the -common case, context-local storage is only `context.set` by the entry -trampoline invoked by [`canon_lift`] and then all transitively reachable core -wasm code (including from any `callback`) assumes `context.get` returns the -same value. Thus, if any *non*-entry-trampoline code calls `context.set`, it is -the responsibility of *that code* to restore this default assumption before +common case, thread-local storage is only `context.set` by the entry trampoline +invoked by [`canon_lift`] and then all transitively reachable core wasm code +(including from any `callback`) assumes `context.get` returns the same value. +Thus, if any *non*-entry-trampoline code calls `context.set`, it is the +responsibility of *that code* to restore this default assumption before allowing control flow to escape into the wild. -For details, see [`context.get`] in the AST explainer and [`canon_context_get`] -in the Canonical ABI explainer. +For more information, see [`context.get`] in the AST explainer. ### Structured concurrency @@ -316,8 +337,10 @@ is the **supertask** relationship). Since a task+subtask pair have the same supertask, they can be thought of as a single node in the async call stack. 
A subtask/supertask relationship is immutably established when an import is -called, setting the [current task](#current-task) as the supertask -of the new subtask created for the import call. +called, setting the [current task](#current-thread-and-task) as the supertask +of the new subtask created for the import call. Thus, one reason for +associating every thread with a "containing task" is to ensure that there is +always a well-defined async call stack. A semantically-observable use of the async call stack is to distinguish between hazardous **recursive reentrance**, in which a component instance is reentered @@ -334,22 +357,19 @@ Component Model's async call stack, linkage *between* different languages would be lost at component boundaries, leading to a loss of overall context in multi-component applications. -There is an important nuance to the Component Model's minimal form of -Structured Concurrency compared to Structured Concurrency support that appears -in popular source language features/libraries. Often, "Structured Concurrency" -refers to an invariant that all "child" tasks finish or are cancelled before a -"parent" task completes. However, the Component Model doesn't force subtasks to -[return](#returning) or be cancelled before the supertask returns (this is left -as an option to particular source languages to enforce or not). The reason for -not enforcing a stricter form of Structured Concurrency at the Component -Model level is that there are important use cases where forcing a supertask to -stay resident simply to wait for a subtask to finish would waste resources -without tangible benefit. Instead, we can say that once the core wasm -implementing a supertask finishes execution, the supertask semantically "tail -calls" any still-live subtasks, staying technically-alive until they complete, -but not consuming real resources. 
Concretely, this means that a supertask that
-finishes executing stays on the callstack of any still-executing subtasks for
-the abovementioned purposes until all transitive subtasks finish.
+There is an important gap between the Component Model's minimal form of
+Structured Concurrency and the Structured Concurrency support that appears in
+popular source language features/libraries. Often, "[Structured Concurrency]"
+refers to an invariant that all "child tasks" finish or are cancelled before a
+"parent task" completes. However, the Component Model doesn't force a subtask's
+threads to all return before the supertask's threads all return. The reason for
+not enforcing a stricter form of Structured Concurrency at the Component Model
+level is that there are important use cases where forcing a supertask's thread
+to stay resident just to wait for subtasks to finish would waste resources
+without tangible benefit. Instead, we can say that once a supertask's last
+thread finishes execution, the supertask semantically "tail calls" any
+still-executing subtasks, staying technically-alive and on the async call
+stack until they complete, but not consuming real resources.
 
 For scenarios where one component wants to *non-cooperatively* put an upper
 bound on execution of a call into another component, a separate "[blast zone]"
@@ -474,53 +494,75 @@ buffering between the readable and writable ends.
 
 ### Waiting
 
 When a component asynchronously lowers an import, it is explicitly requesting
-that, if the import blocks, control flow be returned back to the calling task
+that, if the import blocks, control flow be returned back to the calling thread
 so that it can do something else. Similarly, if `stream.read` or `stream.write`
-are called asynchronously and would block, they return a "blocked" code so that
-the caller can continue to make progress on other things. 
But eventually, a -task will run out of other things to do and will need to **wait** for progress -on one of the task's subtasks, reads or writes, which are collectively called -its **waitables**. The Canonical ABI Python represents waitables with the -[`Waitable`] base class. While a task is waiting, the Component Model runtime -can switch to other running tasks or start new tasks by invoking exports. - -To avoid the O(N) cost of processing an N-ary list of waitables every time a -task needs to wait (which is the classic performance bottleneck of, e.g., POSIX -`select()`), the Canonical ABI allows waitables to be maintained in **waitable -sets** which (like `epoll()`) can be waited upon as a whole for any one of the -member waitables to make progress. Waitable sets are independent of tasks; -tasks can wait on different waitable sets over time and a single waitable set -can be waited upon by multiple tasks at once. Waitable sets are local to a -component instance and cannot be shared across component boundaries. - -The Canonical ABI provides two ways for a task to wait on a waitable set: -* Core wasm can pass (the index of) the waitable set as a parameter to the - [`waitable-set.wait`] built-in which blocks and returns the event that - occurred. -* If the task uses a `callback` function, core wasm can return (the index of) - the waitable set as a return value to the event loop, which will block and - then pass the event that occurred as a parameter to the `callback`. - -While the two approaches have significant runtime implementation differences, -semantically they do the same thing which, in the Canonical ABI Python code, is -factored out into the [`Task.wait_for_event`] method. Thus, the difference between -`callback` and non-`callback` is one of optimization (as described -[above](#high-level-approach)), not expressivity. - -In addition to waiting for an event to occur, a task can also **poll** for -whether an event has already occurred. 
Polling does not block, but does allow -other tasks to be switched to and executed. Polling is opportunistic, allowing -the servicing of higher-priority events in the middle of longer-running -computations; when there is nothing left to do, a task must *wait*. A task -can poll by either calling [`waitable-set.poll`] or, when using a -`callback`, by returning the Canonical-ABI-defined "poll" code to the event loop -along with (the index of) the waitable set to poll. - -Lastly, if a long-running task wants to allow other tasks to execute, without -having any of its own subtasks to wait on, it can **yield**, allowing other -tasks to be scheduled before continuing execution of the current task. A task -can yield by either calling [`yield`] or, when using a `callback`, by returning -the Canonical-ABI-defined "yield" code to the event loop. +are called asynchronously and block, they return a "blocked" code so that the +caller can continue to make progress on other things. But eventually, a thread +will run out of other things to do and will need to wait for something else to +happen by **suspending** itself until something else happens. + +The following three built-ins put threads into a suspended state: +* [`thread.new_indirect`]: create a new thread that is initially suspended + and continue executing the current thread +* [`thread.switch-to`]: suspend the current thread and immediately resume a + given thread +* [`thread.suspend`]: suspend the current thread and resume any transitive + async caller on the stack + +These built-ins enable "green thread" [use cases](#goals), allowing the +language's runtime (compiled to wasm) to deterministically control which thread +executes when. 
+
+The following three built-ins can additionally be called to
+nondeterministically resume a thread at some point in the future (allowing the
+embedder to apply whatever scheduler heuristics it chooses based on, e.g.,
+timing and priority):
+
+* [`thread.resume-later`]: resume a given thread at some point in the future
+  and continue executing in the current thread
+* [`thread.yield-to`]: immediately resume a given thread and resume the current
+  thread at some point in the future
+* [`thread.yield`]: immediately resume *some* (nondeterministically-selected)
+  other thread and resume the current thread at some point in the future
+
+These built-ins enable the "host thread" [use cases](#goals), allowing the
+embedder to nondeterministically control which thread is resumed when. In
+particular, [`pthread_create`] can be implemented using `thread.new_indirect`
+and either `thread.resume-later` or `thread.yield-to` (thereby allowing the
+pthreads implementation to choose whether to execute a new pthread eagerly or
+not).
+
+Additionally, a thread may need to wait for progress to be made on an async
+subtask or stream/future read/write in progress. Subtasks and readable/writable
+ends of streams/futures are collectively called **waitables** and can be put
+into **waitable sets**, which a thread can wait on. Waitable sets avoid the O(N)
+cost of passing and examining a list of waitables every time a thread needs to
+wait for progress in the same manner as, e.g., `epoll`.
+ +In particular, the following built-ins allow building and using waitable sets: +* [`waitable-set.new`]: return a new empty waitable set +* [`waitable.join`]: add, move, or remove a given waitable to/from a given + waitable set +* [`waitable-set.wait`]: suspend until one of the waitables in the given set + has a pending event and then return that event +* [`waitable-set.poll`]: first `thread.yield` and, once resumed, if any of the + waitables in the given set has a pending event, return that event; otherwise + return a sentinel "none" value + +Threads that are explicitly suspended (via `thread.new_indirect`, +`thread.switch-to` or `thread.suspend`) will stay suspended indefinitely until +explicitly resumed (via `thread.switch-to`, `thread.resume-later`, +`thread.yield-to`). Attempting to explicitly resume a thread that was *not* +explicitly suspended by one of these three built-ins traps. For example, +attempting to `thread.resume-later` a thread waiting on `waitable-set.wait` or +a synchronous import call will trap. Thus, language runtimes and compilers have +to be careful when using a mix of explicit and implicit suspension/resumption. + +Lastly, when an async function is implemented using the `callback` suboption +(mentioned in the [summary](#summary)), instead of calling `wait`, `poll` or +`yield`, as an optimization, the `callback` function can *return* to wait in +the event loop, minimizing switching costs and freeing up the stack in the +interim. ### Backpressure @@ -568,7 +610,14 @@ loop interleaving `stream.read`s (of the readable end passed for `in`) and `stream.write`s (of the writable end it `stream.new`ed) before exiting the task. -Once `task.return` is called, the task is in the "returned" state. +*Any* of the threads contained by a task can call `task.return`; there is no +"main thread" of a task. When the last thread of a task returns, there is a +trap if `task.return` has not been called. 
Thus, *some* thread (either the +thread created implicitly for the initial export call or some thread +transitively created by that thread) must call `task.return`. + +Once `task.return` is called, the task is in the "returned" state. Calling +`task.return` when not in the "started" state traps. ### Borrows @@ -614,32 +663,43 @@ cancelled-before-started and cancelled-before-returned. A subtask in one of these terminal states is said to be **resolved**. A resolved subtask has always dropped all the borrowed handles that it was lent during the call. -As with the rest of async, cancellation is *cooperative*, allowing the subtask -a chance to execute and clean up before it transitions to a resolved state (and -relinquishes its borrowed handles). Since there are valid use cases where -successful cancellation requires performing additional I/O using borrowed -handles and potentially blocking in the process, the Component Model does not -impose any limits on what a subtask can do after receiving a cancellation -request nor is there a non-cooperative option to force termination (instead, -this functionality would come as part of a future "[blast zone]" feature). -Thus, the `subtask.cancel` built-in can block and works just like an import -call in that it can be called synchronously or asynchronously. - -On the callee side of cancellation: when a caller requests cancellation via -`subtask.cancel`, the callee receives a [`TASK_CANCELLED`] event (as produced -by one of the `waitable-set.{wait,poll}` or `yield` built-ins or as received by -the `callback` function). Upon receiving notice of cancellation, the callee can -call the [`task.cancel`] built-in to resolve the subtask without returning a -value. Alternatively, the callee can still call [`task.return`] as-if there -were no cancellation. `task.cancel` doesn't take a value to return but does -enforce the same [borrow](#borrows) rules as `task.return`. 
Ideally, a callee -will `task.cancel` itself as soon as possible after receiving a -`TASK_CANCELLED` event so that any caller waiting for the recovery of lent -handles is unblocked ASAP. As with `task.return`, after calling `task.cancel`, -a callee can continue executing before exiting the task. - -See the [`canon_subtask_cancel`] and [`canon_task_cancel`] functions in the -Canonical ABI explainer for more details. +Cancellation is *cooperative*, delivering the request for cancellation to one +of the subtask's threads and then allowing the subtask to continue executing +for an arbitrary amount of time (calling imports, performing I/O and everything +else) until the subtask decides to call [`task.cancel`] to confirm the +cancellation or, for whatever reason, call `task.return` as-if there had been +no cancellation. `task.cancel` enforces the same "all borrowed handles dropped" +rule as `task.return`, so that once a subtask is resolved, the caller knows its +lent handles have been returned. If the subtask was waiting to start due to +backpressure, the subtask is immediately aborted without running the callee at +all. + +When `subtask.cancel` is called, it will attempt to immediately resume one of +the subtask's threads which is in a cancellable state, passing it a sentinel +"cancelled" value. A thread is in a "cancellable" state if it calls one of the +[waiting](#waiting) built-ins with the `cancellable` immediate set (indicating +that the caller expects and propagates cancellation appropriately) or, if using +a `callback`, returns to the event loop (which always waits cancellably). If a +subtask has no cancellable threads, no thread is resumed and the request for +cancellation is remembered in the task state, to be delivered immediately at +the next cancellable wait. In the worst case, though, a component may never +wait cancellably and thus cancellation may be silently ignored. + +`subtask.cancel` can be called synchronously or asynchronously. 
If called
+synchronously, `subtask.cancel` waits until the subtask reaches a resolved
+state and returns which state was reached. If called asynchronously, then if a
+cancellable subtask thread is resumed *and* the subtask reaches a resolved
+state before suspending itself for whatever reason, `subtask.cancel` will return
+which state was reached. Otherwise, `subtask.cancel` will return a "blocked"
+sentinel value and the caller must [wait][waiting] via a waitable set until the
+subtask reaches a resolved state.
+
+The Component Model does not provide a mechanism to force prompt termination of
+threads, as this can lead to leaks and corrupt state in a still-live component
+instance. In the future, prompt termination could be added as part of a "[blast
+zone]" feature that promptly destroys whole component instances, automatically
+dropping all handles held by the destroyed instance, thereby avoiding the leak
+and corruption hazards.
 
 ### Nondeterminism
 
@@ -671,24 +731,26 @@ defined by the Component Model:
 * If there are multiple waitables with a pending event in a waitable set that
   is being waited on or polled, there is a nondeterministic choice of which
   waitable's event is delivered first.
-* If multiple tasks wait on or poll the same waitable set at the same time,
-  the distribution of events to tasks is nondeterministic.
-* If multiple tasks that previously blocked are unblocked at the same time, the
-  sequential order in which they are executed is nondeterministic.
-* Whenever a task yields or waits on (or polls) a waitable set with an already
-  pending event, whether the task "blocks" and transfers execution to its async
-  caller is nondeterministic.
-* If multiple tasks are waiting on [backpressure](#backpressure), and the
-  backpressure is disabled, the order in which these pending tasks (and new
-  tasks started while there are still pending tasks) start is nondeterministic.
+* If multiple threads wait on or poll the same waitable set at the same time, + the distribution of events to threads is nondeterministic. +* Whenever a thread yields or waits on (or polls) a waitable set with an already + pending event, whether the thread suspends and transfers execution to an + async caller is nondeterministic. +* If multiple threads that previously suspended can be resumed at the same + time, the order in which they are resumed is nondeterministic. +* If multiple tasks are blocked by backpressure and the backpressure is + disabled, the order in which these pending tasks start, along with how + they interleave with new tasks, is nondeterministic. +* If a task containing multiple threads is cancelled, the choice of which + thread receives the request for cancellation is nondeterministic. Despite the above, the following scenarios do behave deterministically: * If a component `a` asynchronously calls the export of another component `b`, control flow deterministically transfers to `b` and then back to `a` when - `b` returns or blocks. + `b` returns or suspends. * If a component `a` asynchronously cancels a subtask in another component `b`, control flow deterministically transfers to `b` and then back to `a` when `b` - resolves or blocks. + resolves or suspends. * If a component `a` asynchronously cancels a subtask in another component `b` that was blocked before starting due to backpressure, cancellation completes deterministically and immediately. @@ -738,42 +800,6 @@ concurrently, relaxing the fully synchronous model of instantiation supported by declarative instantiation and `start` above. -## Interaction with multi-threading - -For now, the integration between multi-threading (via [`thread.spawn*`]) and -native async is limited. 
In particular, because all [lift and lower definitions] -produce non-`shared` functions, any threads spawned by a component via -`thread.spawn*` will not be able to directly call imports (synchronously *or* -asynchronously) and will thus have to use Core WebAssembly `atomics.*` -instructions to switch back to a non-`shared` function running on the "main" -thread (i.e., whichever thread was used to call the component's exports). - -However, a future addition to this proposal (in the [TODO](#todo)s below) would -be to allow lifting and lowering with `async` + `shared`. What's exciting about -this approach is that a non-`shared` component-level function could be safely -lowered with `async shared`. In the case that the lifted function being lowered -was also `async shared`, the entire call could happen on the non-main thread -without a context switch. But if the lifting side was non-`shared`, then the -Component Model could automatically handle the synchronization of enqueuing a -call to the export (as in the backpressure case mentioned above), returning a -subtask for the async caller to wait on as usual. Thus, the sync+async -composition story described above could naturally be extended to a -sync+async+shared composition story, continuing to avoid the "what color is -your function" problem (where `shared` is the [color]). - -Even without any use of [`thread.spawn*`], native async provides an opportunity -to achieve some automatic parallelism "for free". In particular, due to the -shared-nothing nature of components, each component instance could be given a -separate thread on which to interleave all tasks executing in that instance. -Thus, in a cross-component call from `C1` to `C2`, `C2`'s task can run in a -separate thread that is automatically synchronized with `C1` by the runtime. -This is analogous to how traditional OS processes can run in separate threads, -except that the component model is *allowing*, but not *requiring* the separate -threads. 
While it's unclear how much parallelism this would unlock in practice, -it does present interesting opportunities to experiment with optimizations over -time as applications are built with more components. - - ## Async ABI At an ABI level, native async in the Component Model defines for every WIT @@ -783,9 +809,9 @@ signature. This async-oriented core function signature is intended to be called or implemented by generated bindings which then map the low-level core async protocol to the languages' higher-level native concurrency features. Because the WIT-level `async` attribute is purely a *hint* (as mentioned -[above](#sync-and-async-functions)), *every* WIT function has an async core -function signature; `async` just provides hints to the bindings generator for -which to use by default. +[above](#summary)), *every* WIT function has an async core function signature; +`async` just provides hints to the bindings generator for which to use by +default. ### Async Import ABI @@ -858,7 +884,7 @@ func(s1: stream>, s2: list>) -> result) -> future ``` @@ -1184,18 +1210,23 @@ comes after: `shared` -[Wasm I/O 2024 presentation]: https://www.youtube.com/watch?v=y3x4-nQeXxc +[wasmio-2024]: https://www.youtube.com/watch?v=y3x4-nQeXxc +[wasmio-2025]: https://www.youtube.com/watch?v=mkkYNw8gTQg + [Color]: https://journal.stuffwithstuff.com/2015/02/01/what-color-is-your-function/ +[Weak Memory Model]: https://people.mpi-sws.org/~rossberg/papers/Watt,%20Rossberg,%20Pichon-Pharabod%20-%20Weakening%20WebAssembly%20[Extended].pdf -[Fibers]: https://en.wikipedia.org/wiki/Fiber_(computer_science) +[Fiber]: https://en.wikipedia.org/wiki/Fiber_(computer_science) [CPS Transform]: https://en.wikipedia.org/wiki/Continuation-passing_style -[Event Loop]: https://en.wikipedia.org/wiki/Event_loop -[Structured Concurrency]: https://en.wikipedia.org/wiki/Structured_concurrency +[Asyncify]: https://emscripten.org/docs/porting/asyncify.html [Session Types]: https://en.wikipedia.org/wiki/Session_type 
+[Structured Concurrency]: https://en.wikipedia.org/wiki/Structured_concurrency [Unit]: https://en.wikipedia.org/wiki/Unit_type -[Thread-local Storage]: https://en.wikipedia.org/wiki/Thread-local_storage [FS or GS Segment Base Address]: https://docs.kernel.org/arch/x86/x86_64/fsgs.html +[Green Thread]: https://en.wikipedia.org/wiki/Green_thread +[Kernel Thread]: https://en.wikipedia.org/wiki/Thread_(computing)#kernel_thread [Cooperative]: https://en.wikipedia.org/wiki/Cooperative_multitasking +[Cooperatively]: https://en.wikipedia.org/wiki/Cooperative_multitasking [Multithreading]: https://en.wikipedia.org/wiki/Multithreading_(computer_architecture) [Overlapped I/O]: https://en.wikipedia.org/wiki/Overlapped_I/O [`io_uring`]: https://en.wikipedia.org/wiki/Io_uring @@ -1203,10 +1234,9 @@ comes after: [`select`]: https://pubs.opengroup.org/onlinepubs/007908799/xsh/select.html [`O_NONBLOCK`]: https://pubs.opengroup.org/onlinepubs/7908799/xsh/open.html +[`pthread_create`]: https://pubs.opengroup.org/onlinepubs/7908799/xsh/pthread_create.html [AST Explainer]: Explainer.md -[Lift and Lower Definitions]: Explainer.md#canonical-definitions -[Lifted]: Explainer.md#canonical-definitions [Canonical Built-in]: Explainer.md#canonical-built-ins [`context.get`]: Explainer.md#-contextget [`context.set`]: Explainer.md#-contextset @@ -1215,39 +1245,34 @@ comes after: [`task.return`]: Explainer.md#-taskreturn [`task.cancel`]: Explainer.md#-taskcancel [`subtask.cancel`]: Explainer.md#-subtaskcancel -[`yield`]: Explainer.md#-yield +[`waitable-set.new`]: Explainer.md#-waitable-setnew [`waitable-set.wait`]: Explainer.md#-waitable-setwait [`waitable-set.poll`]: Explainer.md#-waitable-setpoll -[`thread.spawn*`]: Explainer.md#-threadspawn_ref +[`waitable.join`]: Explainer.md#-waitablejoin +[`thread.new_indirect`]: Explainer.md#-threadnew_indirect +[`thread.index`]: Explainer.md#-threadindex +[`thread.suspend`]: Explainer.md#-threadsuspend +[`thread.switch-to`]: Explainer.md#-threadswitch-to 
+[`thread.resume-later`]: Explainer.md#-threadresume-later +[`thread.yield-to`]: Explainer.md#-threadyield-to +[`thread.yield`]: Explainer.md#-threadyield [`{stream,future}.new`]: Explainer.md#-streamnew-and-futurenew [`{stream,future}.{read,write}`]: Explainer.md#-streamread-and-streamwrite [`stream.cancel-write`]: Explainer.md#-streamcancel-read-streamcancel-write-futurecancel-read-and-futurecancel-write -[ESM-integration]: Explainer.md#ESM-integration [Canonical ABI Explainer]: CanonicalABI.md -[ABI Options]: CanonicalABI.md#canonical-abi-options [`canon_lift`]: CanonicalABI.md#canon-lift [`unpack_callback_result`]: CanonicalABI.md#canon-lift [`canon_lower`]: CanonicalABI.md#canon-lower -[`canon_context_get`]: CanonicalABI.md#-canon-contextget -[`canon_waitable_set_wait`]: CanonicalABI.md#-canon-waitable-setwait -[`canon_task_return`]: CanonicalABI.md#-canon-taskreturn -[`canon_task_cancel`]: CanonicalABI.md#-canon-taskcancel -[`canon_subtask_cancel`]: CanonicalABI.md#-canon-subtaskcancel -[`Task`]: CanonicalABI.md#task-state -[`Task.enter`]: CanonicalABI.md#task-state -[`Task.wait_for_event`]: CanonicalABI.md#task-state -[`Waitable`]: CanonicalABI.md#waitable-state -[`TASK_CANCELLED`]: CanonicalABI.md#waitable-state +[`Store`]: CanonicalABI.md#embedding +[`ComponentInstance`]: CanonicalABI.md#component-instance-state +[`Thread`]: CanonicalABI.md#thread-state [`Task`]: CanonicalABI.md#task-state -[`Subtask`]: CanonicalABI.md#subtask-state [Stream State]: CanonicalABI.md#stream-state [Future State]: CanonicalABI.md#future-state [Binary Format]: Binary.md [WIT]: WIT.md -[Goals]: ../high-level/Goals.md -[Use Cases]: ../high-level/UseCases.md [Blast Zone]: FutureFeatures.md#blast-zones [Reentrance]: Explainer.md#component-invariants [`start`]: Explainer.md#start-definitions @@ -1263,7 +1288,6 @@ comes after: [wasi-libc]: https://github.com/WebAssembly/wasi-libc [WASI Preview 3]: https://github.com/WebAssembly/WASI/tree/main/wasip2#looking-forward-to-preview-3 
-[`wasi:http/handler.handle`]: https://github.com/WebAssembly/wasi-http/blob/main/wit-0.3.0-draft/handler.wit [Runtime Instantiation]: https://github.com/WebAssembly/component-model/issues/423 [Top-level `await`]: https://github.com/tc39/proposal-top-level-await diff --git a/design/mvp/Explainer.md b/design/mvp/Explainer.md index b00b2407..6b2e7370 100644 --- a/design/mvp/Explainer.md +++ b/design/mvp/Explainer.md @@ -26,9 +26,8 @@ more user-focused explanation, take a look at the * [Canonical ABI](#canonical-abi) * [Canonical built-ins](#canonical-built-ins) * [Resource built-ins](#resource-built-ins) - * [Async built-ins](#-async-built-ins) + * [Concurrency built-ins](#-concurrency-built-ins) * [Error Context built-ins](#-error-context-built-ins) - * [Threading built-ins](#-threading-built-ins) * [Value definitions](#-value-definitions) * [Start definitions](#-start-definitions) * [Import and export definitions](#import-and-export-definitions) @@ -54,6 +53,7 @@ implemented, considered stable and included in a future milestone: * ๐Ÿš: marking some builtins as `async` * ๐ŸšŸ: using `async` with `canon lift` without `callback` (stackful lift) * ๐Ÿงต: threading built-ins + * ๐Ÿงตโ‘ก: [shared-everything-threads]-based threading built-ins * ๐Ÿ”ง: fixed-length lists * ๐Ÿ“: the `error-context` type * ๐Ÿ”—: canonical interface names @@ -737,7 +737,7 @@ are useful for: A `future` asynchronously delivers exactly one `T` value from a source to a destination, unless the destination signals that it doesn't want the `T` value -any more. Because all imports can be [called asynchronously](Async.md), futures +any more. Because [all imports can be called asynchronously][summary], futures are not necessary to express a traditional `async` function -- all functions are effectively `async`. Instead futures are useful in more advanced scenarios where a parameter or result value may not be ready at the same time as the @@ -1319,8 +1319,8 @@ and empty results. 
๐Ÿ”€ The `async` option specifies that the component wants to make (for imports) or support (for exports) multiple concurrent (asynchronous) calls. This option can be applied to any component-level function type and changes the derived -Canonical ABI significantly. See the [async explainer] for more details. When -a function signature contains a `future` or `stream`, validation of `canon +Canonical ABI significantly. See the [concurrency explainer] for more details. +When a function signature contains a `future` or `stream`, validation of `canon lower` requires the `async` option to be set (since a synchronous call to a function using these types is highly likely to deadlock). @@ -1333,7 +1333,7 @@ validated to have the following core function type: (param $payload i32) (result $done i32)) ``` -Again, see the [async explainer] for more details. +Again, see the [concurrency explainer] for more details. Based on this description of the AST, the [Canonical ABI explainer] gives a detailed walkthrough of the static and dynamic semantics of `lift` and `lower`. @@ -1408,7 +1408,7 @@ In addition to the `lift` and `lower` canonical function definitions which adapt *existing* functions, there are also a set of canonical "built-ins" that define core functions out of nothing that can be imported by core modules to dynamically interact with Canonical ABI entities like resources, -[tasks, subtasks, waitable sets, streams and futures](Async.md). +[threads, tasks, subtasks, waitable sets, streams and futures](Concurrency.md). ```ebnf canon ::= ... | (canon resource.new (core func ?)) @@ -1422,7 +1422,7 @@ canon ::= ... | (canon backpressure.dec (core func ?)) ๐Ÿ”€ | (canon task.return (result )? * (core func ?)) ๐Ÿ”€ | (canon task.cancel (core func ?)) ๐Ÿ”€ - | (canon yield cancellable? (core func ?)) ๐Ÿ”€ + | (canon yield cancellable? (core func ?)) ๐Ÿ”€โŒ (renamed to 'thread.yield') | (canon waitable-set.new (core func ?)) ๐Ÿ”€ | (canon waitable-set.wait cancellable? 
(memory ) (core func ?)) ๐Ÿ”€ | (canon waitable-set.poll cancellable? (memory ) (core func ?)) ๐Ÿ”€ @@ -1444,12 +1444,19 @@ canon ::= ... | (canon future.cancel-write async? (core func ?)) ๐Ÿ”€ | (canon future.drop-readable (core func ?)) ๐Ÿ”€ | (canon future.drop-writable (core func ?)) ๐Ÿ”€ + | (canon thread.index (core func ?)) ๐Ÿงต + | (canon thread.new_indirect (core func ?)) ๐Ÿงต + | (canon thread.switch-to cancellable? (core func ?)) ๐Ÿงต + | (canon thread.suspend cancellable? (core func ?)) ๐Ÿงต + | (canon thread.resume-later (core func ?)) ๐Ÿงต + | (canon thread.yield-to cancellable? (core func ?)) ๐Ÿงต + | (canon thread.yield cancellable? (core func ?)) ๐Ÿงต | (canon error-context.new * (core func ?)) ๐Ÿ“ | (canon error-context.debug-message * (core func ?)) ๐Ÿ“ | (canon error-context.drop (core func ?)) ๐Ÿ“ - | (canon thread.spawn_ref (core func ?)) ๐Ÿงต - | (canon thread.spawn_indirect (core func ?)) ๐Ÿงต - | (canon thread.available_parallelism (core func ?)) ๐Ÿงต + | (canon thread.spawn_ref shared? (core func ?)) ๐Ÿงตโ‘ก + | (canon thread.spawn_indirect shared? (core func ?)) ๐Ÿงตโ‘ก + | (canon thread.available-parallelism (core func ?)) ๐Ÿงตโ‘ก ``` ##### Resource built-ins @@ -1550,9 +1557,9 @@ transferring ownership of the newly-created resource to the export's caller. For details, see [`canon_resource_rep`] in the Canonical ABI explainer. -##### ๐Ÿ”€ Async built-ins +##### ๐Ÿ”€๐Ÿงต Concurrency built-ins -See the [async explainer] for background. +See the [concurrency explainer] for background. ###### ๐Ÿ”€ `context.get` | Synopsis | | | -------------------------- | --------------- | | Approximate WIT signature | `func() -> T` | | Canonical ABI signature | `[] -> [i32]` | -The `context.get` built-in returns the `i`th element of the [current task]'s -[context-local storage] array. Validation currently restricts `i` to be less -than 1 and `t` to be `i32`, but will be relaxed in the future (as described -[here][context-local storage]). 
+The `context.get` built-in returns the `i`th element of the [current thread]'s +[thread-local storage] array. Validation currently restricts `i` to be less +than 2 and `t` to be `i32`, but these restrictions may be relaxed in the +future. For details, see [`canon_context_get`] in the Canonical ABI explainer. @@ -1575,10 +1582,10 @@ For details, see [`canon_context_get`] in the Canonical ABI explainer. | Approximate WIT signature | `func(v: T)` | | Canonical ABI signature | `[i32] -> []` | -The `context.set` built-in sets the `i`th element of the [current task]'s -[context-local storage] array to the value `v`. Validation currently -restricts `i` to be less than 1 and `t` to be `i32`, but will be relaxed in the -future (as described [here][context-local storage]). +The `context.set` built-in sets the `i`th element of the [current thread]'s +[thread-local storage] array to the value `v`. Validation currently restricts +`i` to be less than 2 and `t` to be `i32`, but these restrictions may be +relaxed in the future. For details, see [`canon_context_set`] in the Canonical ABI explainer. @@ -1621,15 +1628,22 @@ explainer. ###### ๐Ÿ”€ `task.return` +| Synopsis | | +| -------------------------- | --------------------------------------- | +| Approximate WIT signature | `func(results: FuncT.results)` | +| Canonical ABI signature | `[lower(FuncT.results)*] -> []` | + The `task.return` built-in takes as parameters the result values of the -currently-executing task. This built-in must be called exactly once per export -activation. The `canon task.return` definition takes component-level return -type and the list of `canonopt` to be used to lift the return value. When -called, the declared return type and the `string-encoding` and `memory` -`canonopt`s are checked to exactly match those of the current task. +[current task]. One of `task.return` or `task.cancel` must be called exactly +once from any of a task's threads. 
+ +The `canon task.return` definition takes component-level return type and the +list of `canonopt` to be used to lift the return value. When called, the +declared return type and the `string-encoding` and `memory` `canonopt`s are +checked to exactly match those of the current task. -For details, see [Returning] in the async explainer and [`canon_task_return`] -in the Canonical ABI explainer. +For details, see [Returning] in the concurrency explainer and +[`canon_task_return`] in the Canonical ABI explainer. ###### ๐Ÿ”€ `task.cancel` @@ -1641,35 +1655,14 @@ in the Canonical ABI explainer. The `task.cancel` built-in indicates that the [current task] is now [resolved] and has dropped all borrowed handles lent to it during the call (trapping if otherwise). `task.cancel` can only be called after the `task-cancelled` event -has been received (via `callback`, `waitable-set.{wait,poll}` or `yield`) to -indicate that the supertask has requested cancellation and thus is not -expecting a return value. +has been received (via `callback`, `waitable-set.{wait,poll}` or `thread.*`) +to indicate that the supertask has requested cancellation and thus is not +expecting a return value. Once this request is received, any of the task's +threads can call `task.cancel` or `task.return`. -For details, see [Cancellation] in the async explainer and +For details, see [Cancellation] in the concurrency explainer and [`canon_task_cancel`] in the Canonical ABI explainer. -###### ๐Ÿ”€ `yield` - -| Synopsis | | -| -------------------------- | ------------------------------ | -| Approximate WIT signature | `func() -> bool` | -| Canonical ABI signature | `[] -> [i32]` | - -The `yield` built-in allows the runtime to switch to other tasks, enabling a -long-running computation to cooperatively interleave execution. 
If `yield` is -called from a synchronous- or `async callback`-lifted export, no other -synchronous or `async callback`-lifted tasks can start or progress in the -current component instance (ensuring non-reentrance of the core wasm code). -However, non-`callback` `async`-lifted ("stackful async") exports may start -or progress at any time. - -If `cancellable` is set, `yield` may return `true` (`1`) if the caller requests -[cancellation] of the [current task]. If `cancellable` is not set, the return -value is always `false` (`0`). Cancellation is returned at most once for a -given task and thus must be propagated once received. - -For details, see [`canon_yield`] in the Canonical ABI explainer. - ###### ๐Ÿ”€ `waitable-set.new` | Synopsis | | @@ -1711,19 +1704,15 @@ enum subtask-state { cancelled-before-returned, } ``` -The `waitable-set.wait` built-in waits for any one of the [waitables] in the -given [waitable set] `s` to make progress and then returns an `event` -describing the event. The `event` `none` is never returned. Waitable sets -may be `wait`ed upon when empty, in which case the caller will necessarily -block until another task adds a waitable to the set that can make progress. +The `waitable-set.wait` built-in [suspends][waiting] the [current thread] in +a "pending" state until any one of the [waitables] in the given [waitable set] +`s` has an event to deliver. At that point, the thread is in the "ready" state +and can be nondeterministically resumed by the runtime's scheduler at which +point `waitable-set.wait` will return the `event`. (The `none` `event` is used +by `waitable-set.poll` and never returned by `waitable-set.wait`.) -`waitable-set.wait` allows the runtime to cooperatively switch to other tasks -to execute while the current task is blocked. 
If `waitable-set.wait` is -called from a synchronous- or `async callback`-lifted export, no other -synchronous or `async callback`-lifted tasks can start or progress in the -current component instance (ensuring non-reentrance of the core wasm code). -However, non-`callback` `async`-lifted ("stackful async") exports may start -or progress at any time. +Waitable sets may be `wait`ed upon when empty, in which case the caller will +necessarily block until another thread adds a waitable to the set. If `cancellable` is set, `waitable-set.wait` may return `task-cancelled` (`6`) if the caller requests [cancellation] of the [current task]. If @@ -1731,8 +1720,16 @@ If `cancellable` is set, `waitable-set.wait` may return `task-cancelled` `task-cancelled` is returned at most once for a given task and thus must be propagated once received. +If `waitable-set.wait` is called from a synchronous- or `async callback`-lifted +export, no other threads that were implicitly created by a separate +synchronous- or `async callback`-lifted export call can start or progress in +the current component instance until `waitable-set.wait` returns (thereby +ensuring non-reentrance of the core wasm code). However, explicitly-created +threads and threads implicitly created by non-`callback` `async`-lifted +("stackful async") exports may start or progress at any time. + A `subtask` event notifies the supertask that its subtask is now in the given -state (the meanings of which are described by the [async explainer]). +state (the meanings of which are described by the [concurrency explainer]). The meanings of the `{stream,future}-{read,write}` events/payloads are given as part [`stream.read` and `stream.write`](#-streamread-and-streamwrite) and @@ -1753,14 +1750,11 @@ For details, see [`canon_waitable_set_wait`] in the Canonical ABI explainer. where `event` is defined as in [`waitable-set.wait`](#-waitable-setwait). 
-The `waitable-set.poll` built-in returns the `event` `none` if no event -was available without blocking. `poll` implicitly performs a `yield`, allowing -other tasks to be scheduled before `poll` returns. If `waitable-set.poll` is -called from a synchronous- or `async callback`-lifted export, no other -synchronous or `async callback`-lifted tasks can start or progress in the -current component instance (ensuring non-reentrance of the core wasm code). -However, non-`callback` `async`-lifted ("stackful async") exports may start -or progress at any time. +The `waitable-set.poll` built-in [suspends][waiting] the [current thread] in +the "ready" state (like `thread.yield`). Once nondeterministically resumed, +`waitable-set.poll` will return either an event from one of the waitables in +`s` or, if there is none, the `none` `event`. Thus, repeatedly calling +`waitable-set.poll` in a loop allows other tasks to execute. If `cancellable` is set, `waitable-set.poll` may return `task-cancelled` (`6`) if the caller requests [cancellation] of the [current task]. If @@ -1768,6 +1762,14 @@ If `cancellable` is set, `waitable-set.poll` may return `task-cancelled` `task-cancelled` is returned at most once for a given task and thus must be propagated once received. +If `waitable-set.poll` is called from a synchronous- or `async callback`-lifted +export, no other threads that were implicitly created by a separate +synchronous- or `async callback`-lifted export call can start or progress in +the current component instance until `waitable-set.poll` returns (thereby +ensuring non-reentrance of the core wasm code). However, explicitly-created +threads and threads implicitly created by non-`callback` `async`-lifted +("stackful async") exports may start or progress at any time. + The Canonical ABI of `waitable-set.poll` is the same as `waitable-set.wait` (with the `none` case indicated by returning `0`). 
@@ -1782,7 +1784,7 @@ For details, see [`canon_waitable_set_poll`] in the Canonical ABI explainer. The `waitable-set.drop` built-in removes the indicated [waitable set] from the current component instance's table, trapping if the waitable set is not empty -or if another task is concurrently `wait`ing on it. +or if another thread is concurrently `wait`ing on it. For details, see [`canon_waitable_set_drop`] in the Canonical ABI explainer. @@ -1915,9 +1917,9 @@ into the given buffer before the writer end dropped the stream. The `cancelled` case can only arise as the result of a call to `stream.cancel-{read,write}`. If the return value is `none`, then the operation blocked and the caller needs -to [wait](Async.md#waiting) for progress (via `waitable-set.{wait,poll}` or, if -using a `callback`, by returning to the event loop) which will asynchronously -produce an `event` containing a `stream-result`. +to [wait] for progress (via `waitable-set.{wait,poll}` or, if using a +`callback`, by returning to the event loop) which will asynchronously produce +an `event` containing a `stream-result`. If `stream.{read,write}` return `dropped` (synchronously or asynchronously), any subsequent operation on the stream other than `stream.drop-{readable,writable}` @@ -1977,9 +1979,9 @@ future as the first parameter and, if `T` is present, a single-element buffer that can be used to write or read a single `T` value. If the return value is `none`, then the call blocked and the caller needs -to [wait](Async.md#waiting) for progress (via `waitable-set.{wait,poll}` or, if -using a `callback`, by returning to the event loop) which will asynchronously -produce an `event` containing a `future-{read,write}-result`. +to [wait] for progress (via `waitable-set.{wait,poll}` or, if using a +`callback`, by returning to the event loop) which will asynchronously produce +an `event` containing a `future-{read,write}-result`. 
If `future.{read,write}` return `completed` or `dropped` (synchronously or asynchronously), any subsequent operation on the future other than @@ -2011,8 +2013,8 @@ For details, see [`canon_future_read`] in the Canonical ABI explainer. | Canonical ABI signature | `[e: i32] -> [i32]` | The `{stream,future}.cancel-{read,write}` built-ins take the matching [readable -or writable end] of a stream or future that has a pending -`{stream,future}.{read,write}`. +or writable end] of a stream or future that has a pending `async` +`{stream,future}.{read,write}` (trapping otherwise). If cancellation finishes without blocking, the return value is a `stream-result` or `future-{read,write}-result`. If cancellation blocks, the @@ -2045,6 +2047,209 @@ value has not already been written. For details, see [`canon_stream_drop_readable`] in the Canonical ABI explainer. +###### ๐Ÿงต `thread.index` + +| Synopsis | | +| -------------------------- | --------------- | +| Approximate WIT signature | `func() -> u32` | +| Canonical ABI signature | `[] -> [i32]` | + +The `thread.index` built-in returns the index of the [current thread] in the +component instance's table. While `thread.new_indirect` also returns the index +of newly-created threads, threads created implicitly for export calls can only +learn their index via `thread.index`. + +For details, see [`canon_thread_index`] in the Canonical ABI explainer. + +###### ๐Ÿงต `thread.new_indirect` + +| Synopsis | | +| -------------------------- | ------------------------------------------------------------- | +| Approximate WIT signature | `func(fi: u32, c: FuncT.params[0]) -> thread` | +| Canonical ABI signature | `[fi:i32 c:i32] -> [i32]` | + +The `thread.new_indirect` built-in adds a new thread to the current component +instance's table, returning the index of the new thread. 
The function table +supplied via [`core:tableidx`] is indexed by the `fi` operand and then +dynamically checked to match the type `FuncT` (in the same manner as +`call_indirect`). Lastly, the indexed function is called in the new thread +with `c` as its first and only parameter. + +Currently, `FuncT` must be `(func (param i32))` and thus `c` must always be an +`i32`, but this restriction can be loosened in the future as the Canonical +ABI is extended for [memory64] and [GC]. + +As explained in the [concurrency explainer][waiting], a thread created by +`thread.new_indirect` is initially in a suspended state and must be resumed +eagerly or lazily by [`thread.yield-to`](#-threadyield-to) or +[`thread.resume-later`](#-threadresume-later), resp., to begin execution. + +For details, see [`canon_thread_new_indirect`] in the Canonical ABI explainer. + +###### ๐Ÿงต `thread.switch-to` + +| Synopsis | | +| -------------------------- | ------------------------------------------------- | +| Approximate WIT signature | `func(t: thread) -> suspend-result` | +| Canonical ABI signature | `[t:i32] -> [i32]` | + +where `suspend-result` is defined in WIT as: +```wit +enum suspend-result { completed, cancelled } +``` + +The `thread.switch-to` built-in [suspends][waiting] the [current thread] and +immediately resumes execution of the thread `t`, trapping if `t` is not in a +"suspended" state. When the current thread is resumed by some other thread or, +if `cancellable` was set, [cancellation], `thread.switch-to` will return, +indicating what happened. + +If `thread.switch-to` is called from a synchronous- or `async callback`-lifted +export, no other threads that were implicitly created by a separate +synchronous- or `async callback`-lifted export call can start or progress in +the current component instance until `thread.switch-to` returns (thereby +ensuring non-reentrance of the core wasm code). 
However, explicitly-created +threads and threads implicitly created by non-`callback` `async`-lifted +("stackful async") exports may start or progress at any time. + +For details, see [`canon_thread_switch_to`] in the Canonical ABI explainer. + +###### ๐Ÿงต `thread.suspend` + +| Synopsis | | +| -------------------------- | ---------------------------------------- | +| Approximate WIT signature | `func() -> suspend-result` | +| Canonical ABI signature | `[] -> [i32]` | + +The `thread.suspend` built-in [suspends][waiting] the [current thread] which, +depending on the calling context, will either immediately switch control flow +to an `async`-lowered caller or, if the current task has already suspended +before, switch to the runtime's scheduler to find something else to do. When +the current thread is resumed by some other thread or, if `cancellable` was +set, [cancellation], `thread.suspend` will return, indicating what happened. + +If `thread.suspend` is called from a synchronous- or `async callback`-lifted +export, no other threads that were implicitly created by a separate +synchronous- or `async callback`-lifted export call can start or progress in +the current component instance until `thread.suspend` returns (thereby +ensuring non-reentrance of the core wasm code). However, explicitly-created +threads and threads implicitly created by non-`callback` `async`-lifted +("stackful async") exports may start or progress at any time. + +For details, see [`canon_thread_suspend`] in the Canonical ABI explainer. + +###### ๐Ÿงต `thread.resume-later` + +| Synopsis | | +| -------------------------- | ----------------- | +| Approximate WIT signature | `func(t: thread)` | +| Canonical ABI signature | `[t:i32] -> []` | + +The `thread.resume-later` built-in changes the state of thread `t` from +"suspended" to "ready" (trapping if `t` is not in a "suspended" state) so that +the runtime can nondeterministically resume `t` at some point in the future. 
+ +For details, see [waiting] in the concurrency explainer and +[`canon_thread_resume_later`] in the Canonical ABI explainer. + +###### ๐Ÿงต `thread.yield-to` + +| Synopsis | | +| -------------------------- | ----------------------------------- | +| Approximate WIT signature | `func(t: thread) -> suspend-result` | +| Canonical ABI signature | `[t:i32] -> [i32]` | + +The `thread.yield-to` built-in immediately resumes execution of the thread `t` +(trapping if `t` is not in a "suspended" state), leaving the [current thread] in +a "ready" state so that the runtime can nondeterministically resume the current +thread at some point in the future. When the current thread is resumed either +due to runtime scheduling or, if `cancellable` was set, [cancellation], +`thread.yield-to` will return, indicating what happened. + +If `thread.yield-to` is called from a synchronous- or `async callback`-lifted +export, no other threads that were implicitly created by a separate +synchronous- or `async callback`-lifted export call can start or progress in +the current component instance until `thread.yield-to` returns (thereby +ensuring non-reentrance of the core wasm code). However, explicitly-created +threads and threads implicitly created by non-`callback` `async`-lifted +("stackful async") exports may start or progress at any time. + +For details, see [waiting] in the concurrency explainer and +[`canon_thread_yield_to`] in the Canonical ABI explainer. + +###### ๐Ÿงต `thread.yield` + +| Synopsis | | +| -------------------------- | ---------------------------------------- | +| Approximate WIT signature | `func() -> suspend-result` | +| Canonical ABI signature | `[] -> [i32]` | + +The `thread.yield` built-in allows the runtime to potentially switch to any +other thread in the "ready" state, enabling a long-running computation to +cooperatively interleave execution without specifically requesting another +thread to be resumed (as with `thread.yield-to`). 
When the current thread is +resumed either due to runtime scheduling or, if `cancellable` was set, +[cancellation], `thread.yield` will return, indicating what happened. + +If `thread.yield` is called from a synchronous- or `async callback`-lifted +export, no other threads that were implicitly created by a separate +synchronous- or `async callback`-lifted export call can start or progress in +the current component instance until `thread.yield` returns (thereby +ensuring non-reentrance of the core wasm code). However, explicitly-created +threads and threads implicitly created by non-`callback` `async`-lifted +("stackful async") exports may start or progress at any time. + +For details, see [waiting] in the concurrency explainer and +[`canon_thread_yield`] in the Canonical ABI explainer. + +###### ๐Ÿงตโ‘ก `thread.spawn_ref` + +| Synopsis | | +| -------------------------- | ------------------------------------------------------------------- | +| Approximate WIT signature | `func(f: FuncT, c: FuncT.params[0]) -> bool` | +| Canonical ABI signature | `shared? [f:(ref null (shared (func (param i32)))) c:i32] -> [i32]` | + +The `thread.spawn_ref` built-in is an optimization, fusing a call to +`thread.new_ref` (assuming `thread.new_ref` was added as part of adding a +[GC ABI option] to the Canonical ABI) with a call to +[`thread.resume-later`](#-threadresume-later). This optimization is more +impactful once given [shared-everything-threads] and thus gated on ๐Ÿงตโ‘ก. + +For details, see [`canon_thread_spawn_ref`] in the Canonical ABI explainer. + +###### ๐Ÿงตโ‘ก `thread.spawn_indirect` + +| Synopsis | | +| -------------------------- | ------------------------------------------------------------------ | +| Approximate WIT signature | `func(i: u32, c: FuncT.params[0]) -> bool` | +| Canonical ABI signature | `shared? 
[i:i32 c:i32] -> [i32]` | + +The `thread.spawn_indirect` built-in is an optimization, fusing a call to +[`thread.new_indirect`](#-threadnew_indirect) with a call to +[`thread.resume-later`](#-threadresume-later). This optimization is more +impactful once given [shared-everything-threads] and thus gated on ๐Ÿงตโ‘ก. + +For details, see [`canon_thread_spawn_indirect`] in the Canonical ABI +explainer. + +###### ๐Ÿงตโ‘ก `thread.available-parallelism` + +| Synopsis | | +| -------------------------- | ------------------------ | +| Approximate WIT signature | `func() -> u32` | +| Canonical ABI signature | `shared? [] -> [i32]` | + +The `thread.available-parallelism` built-in returns the number of threads that +can be expected to execute in parallel. + +The concept of "available parallelism" is sometimes referred to +as "hardware concurrency", such as in [`navigator.hardwareConcurrency`] in +JavaScript. + +For details, see [`canon_thread_available_parallelism`] in the Canonical ABI +explainer. + + ##### ๐Ÿ“ Error Context built-ins ###### ๐Ÿ“ `error-context.new` @@ -2095,58 +2300,6 @@ instance's table. For details, see [`canon_error_context_drop`] in the Canonical ABI explainer. -##### ๐Ÿงต Threading built-ins - -The [shared-everything-threads] proposal adds component model built-ins for -thread management. These are specified as built-ins and not core WebAssembly -instructions because browsers expect this functionality to come from existing -Web/JS APIs. - -###### ๐Ÿงต `thread.spawn_ref` - -| Synopsis | | -| -------------------------- | ---------------------------------------------------------- | -| Approximate WIT signature | `func(f: FuncT, c: FuncT.params[0]) -> bool` | -| Canonical ABI signature | `[f:(ref null (shared (func (param i32))) c:i32] -> [i32]` | - -The `thread.spawn_ref` built-in spawns a new thread by invoking the shared -function `f` while passing `c` to it, returning whether a thread was -successfully spawned. 
While it's designed to allow different types in the -future, the type of `c` is currently hard-coded to always be `i32`. - -For details, see [`canon_thread_spawn_ref`] in the Canonical ABI explainer. - -###### ๐Ÿงต `thread.spawn_indirect` - -| Synopsis | | -| -------------------------- | ------------------------------------------------- | -| Approximate WIT signature | `func(i: u32, c: FuncT.params[0]) -> bool` | -| Canonical ABI signature | `[i:i32 c:i32] -> [i32]` | - -The `thread.spawn_indirect` built-in spawns a new thread by retrieving the -shared function `f` from a table using index `i` and traps if the type of `f` is -not equal to `FuncT` (much like the `call_indirect` core instruction). Once `f` -is retrieved, this built-in operates like `thread.spawn_ref` above, including -the limitations on `f`'s parameters. - -For details, see [`canon_thread_spawn_indirect`] in the Canonical ABI explainer. - -###### ๐Ÿงต `thread.available_parallelism` - -| Synopsis | | -| -------------------------- | --------------- | -| Approximate WIT signature | `func() -> u32` | -| Canonical ABI signature | `[] -> [i32]` | - -The `thread.available_parallelism` built-in returns the number of threads that -can be expected to execute in parallel. - -The concept of "available parallelism" corresponds is sometimes referred to -as "hardware concurrency", such as in [`navigator.hardwareConcurrency`] in -JavaScript. - -For details, see [`canon_thread_available_parallelism`] in the Canonical ABI -explainer. ### ๐Ÿช™ Value Definitions @@ -2521,7 +2674,7 @@ annotations trigger additional type-validation rules (listed in When a function is annotated with `async`, bindings generators are expected to emit whatever asynchronous language construct is appropriate (such as an `async` function in JS, Python or Rust). Note the absence of -`[async constructor]`. See the [async explainer] for more details. +`[async constructor]`. See the [concurrency explainer] for more details. 
The `label` production used inside `plainname` as well as the labels of `record` and `variant` types are required to have [kebab case]. The reason for @@ -2733,7 +2886,7 @@ three runtime invariants: in the Canonical ABI explainer.) This default prevents obscure composition-time bugs and also enables more-efficient non-reentrant runtime glue code. This rule will be relaxed by an opt-in - function type attribute in the [future](Async.md#todo). + function type attribute in the [future](Concurrency.md#todo). ## JavaScript Embedding @@ -2993,6 +3146,7 @@ For some use-case-focused, worked examples, see: [`core:datastring`]: https://webassembly.github.io/spec/core/text/modules.html#text-datastring [func-import-abbrev]: https://webassembly.github.io/spec/core/text/modules.html#text-func-abbrev [`core:version`]: https://webassembly.github.io/spec/core/binary/modules.html#binary-version +[`core:tableidx`]: https://webassembly.github.io/spec/core/syntax/modules.html#syntax-tableidx [Embedder]: https://webassembly.github.io/spec/core/appendix/embedding.html [`module_instantiate`]: https://webassembly.github.io/spec/core/appendix/embedding.html#mathrm-module-instantiate-xref-exec-runtime-syntax-store-mathit-store-xref-syntax-modules-syntax-module-mathit-module-xref-exec-runtime-syntax-externval-mathit-externval-ast-xref-exec-runtime-syntax-store-mathit-store-xref-exec-runtime-syntax-moduleinst-mathit-moduleinst-xref-appendix-embedding-embed-error-mathit-error @@ -3059,11 +3213,13 @@ For some use-case-focused, worked examples, see: [stack-switching]: https://github.com/WebAssembly/stack-switching/blob/main/proposals/stack-switching/Explainer.md [esm-integration]: https://github.com/WebAssembly/esm-integration/tree/main/proposals/esm-integration [gc]: https://github.com/WebAssembly/gc/blob/main/proposals/gc/MVP.md +[memory64]: https://github.com/webAssembly/memory64 [`rectype`]: https://webassembly.github.io/gc/core/text/types.html#text-rectype [shared-everything-threads]: 
https://github.com/WebAssembly/shared-everything-threads [WASI Preview 2]: https://github.com/WebAssembly/WASI/tree/main/wasip2#readme [WASI Preview 3]: https://github.com/WebAssembly/WASI/tree/main/wasip2#looking-forward-to-preview-3 [reference types]: https://github.com/WebAssembly/reference-types/blob/master/proposals/reference-types/Overview.md +[GC ABI Option]: https://github.com/WebAssembly/component-model/issues/525 [Strongly-unique]: #name-uniqueness @@ -3075,7 +3231,6 @@ For some use-case-focused, worked examples, see: [`canon_backpressure_{inc,dec}`]: CanonicalABI.md#-canon-backpressureincdec [`canon_task_return`]: CanonicalABI.md#-canon-taskreturn [`canon_task_cancel`]: CanonicalABI.md#-canon-taskcancel -[`canon_yield`]: CanonicalABI.md#-canon-yield [`canon_waitable_set_new`]: CanonicalABI.md#-canon-waitable-setnew [`canon_waitable_set_wait`]: CanonicalABI.md#-canon-waitable-setwait [`canon_waitable_set_poll`]: CanonicalABI.md#-canon-waitable-setpoll @@ -3095,6 +3250,13 @@ For some use-case-focused, worked examples, see: [`canon_error_context_new`]: CanonicalABI.md#-canon-error-contextnew [`canon_error_context_debug_message`]: CanonicalABI.md#-canon-error-contextdebug-message [`canon_error_context_drop`]: CanonicalABI.md#-canon-error-contextdrop +[`canon_thread_index`]: CanonicalABI.md#-canon-threadindex +[`canon_thread_new_indirect`]: CanonicalABI.md#-canon-threadnew_indirect +[`canon_thread_suspend`]: CanonicalABI.md#-canon-threadsuspend +[`canon_thread_switch_to`]: CanonicalABI.md#-canon-threadswitch-to +[`canon_thread_resume_later`]: CanonicalABI.md#-canon-threadresume-later +[`canon_thread_yield_to`]: CanonicalABI.md#-canon-threadyield-to +[`canon_thread_yield`]: CanonicalABI.md#-canon-threadyield [`canon_thread_spawn_ref`]: CanonicalABI.md#-canon-threadspawn_ref [`canon_thread_spawn_indirect`]: CanonicalABI.md#-canon-threadspawn_indirect [`canon_thread_available_parallelism`]: CanonicalABI.md#-canon-threadavailable_parallelism @@ -3102,21 +3264,25 
@@ For some use-case-focused, worked examples, see: [Use Cases]: ../high-level/UseCases.md [Host Embeddings]: ../high-level/UseCases.md#hosts-embedding-components -[Async Explainer]: Async.md -[Task]: Async.md#task -[Current Task]: Async.md#current-task -[Context-Local Storage]: Async.md#context-local-storage -[Subtask]: Async.md#structured-concurrency -[Stream or Future]: Async.md#streams-and-futures -[Readable and Writable Ends]: Async.md#streams-and-futures -[Readable or Writable End]: Async.md#streams-and-futures -[Waiting]: Async.md#waiting -[Waitables]: Async.md#waiting -[Waitable Set]: Async.md#waiting -[Backpressure]: Async.md#backpressure -[Returning]: Async.md#returning -[Resolved]: Async.md#cancellation -[Cancellation]: Async.md#cancellation +[Concurrency Explainer]: Concurrency.md +[Summary]: Concurrency.md#summary +[Thread]: Concurrency.md#threads-and-tasks +[Task]: Concurrency.md#threads-and-tasks +[Current Thread]: Concurrency.md#current-thread-and-task +[Current Task]: Concurrency.md#current-thread-and-task +[Thread-Local Storage]: Concurrency.md#thread-local-storage +[Subtask]: Concurrency.md#structured-concurrency +[Stream or Future]: Concurrency.md#streams-and-futures +[Readable and Writable Ends]: Concurrency.md#streams-and-futures +[Readable or Writable End]: Concurrency.md#streams-and-futures +[Wait]: Concurrency.md#waiting +[Waiting]: Concurrency.md#waiting +[Waitables]: Concurrency.md#waiting +[Waitable Set]: Concurrency.md#waiting +[Backpressure]: Concurrency.md#backpressure +[Returning]: Concurrency.md#returning +[Resolved]: Concurrency.md#cancellation +[Cancellation]: Concurrency.md#cancellation [Component Model Documentation]: https://component-model.bytecodealliance.org [`wizer`]: https://github.com/bytecodealliance/wizer diff --git a/design/mvp/WIT.md b/design/mvp/WIT.md index dd0d425d..3e104df9 100644 --- a/design/mvp/WIT.md +++ b/design/mvp/WIT.md @@ -1458,8 +1458,8 @@ is just a hint and not enforced by the runtime, it is technically 
possible for a non-`async` callee to block. In that case, though, it is the *callee's* fault for any resultant loss of concurrency, not the caller's. Thus, `async` is primarily intended to document expectations in a way that can be taken -advantage of by bindings generators. (For more details, see the [async -explainer](Async.md#sync-and-async-functions).) +advantage of by bindings generators. (For more details, see the [concurrency +explainer](Concurrency.md).) ## Item: `use` @@ -1780,8 +1780,8 @@ variant result { These types are so frequently used and frequently have language-specific meanings though so they're also provided as first-class types. -The `future` and `stream` types are described as part of the [async -explainer](Async.md#streams-and-futures). +The `future` and `stream` types are described as part of the [concurrency +explainer](Concurrency.md#streams-and-futures). Finally the last case of a `ty` is simply an `id` which is intended to refer to another type or resource defined in the document. 
Note that definitions can come diff --git a/design/mvp/canonical-abi/definitions.py b/design/mvp/canonical-abi/definitions.py index 9f45ae24..b8ce8ae2 100644 --- a/design/mvp/canonical-abi/definitions.py +++ b/design/mvp/canonical-abi/definitions.py @@ -52,6 +52,8 @@ class CoreFuncType(CoreExternType): def __eq__(self, other): return self.params == other.params and self.results == other.results +CoreValType = int | float + def types_match_values(ts, vs): if len(ts) != len(vs): return False @@ -325,22 +327,6 @@ def __init__(self, impl, dtor = None, dtor_sync = True, dtor_callback = None): self.dtor_sync = dtor_sync self.dtor_callback = dtor_callback -#### Context-Local Storage - -class ContextLocalStorage: - LENGTH = 1 - array: list[int] - - def __init__(self): - self.array = [0] * ContextLocalStorage.LENGTH - - def set(self, i, v): - assert(types_match_values(['i32'], [v])) - self.array[i] = v - - def get(self, i): - return self.array[i] - #### Thread State class Thread: @@ -352,6 +338,10 @@ class Thread: cancellable: bool cancelled: bool in_event_loop: bool + index: Optional[int] + context: list[int] + + CONTEXT_LENGTH = 2 def running(self): return self.parent_lock is not None @@ -375,12 +365,16 @@ def __init__(self, task, thread_func): self.cancellable = False self.cancelled = False self.in_event_loop = False + self.index = None + self.context = [0] * Thread.CONTEXT_LENGTH def fiber_func(): self.fiber_lock.acquire() assert(self.running()) - thread_func() + thread_func(self) assert(self.running()) self.task.thread_stop(self) + if self.index is not None: + self.task.inst.table.remove(self.index) self.parent_lock.release() self.fiber = threading.Thread(target = fiber_func) self.fiber.start() @@ -389,6 +383,10 @@ def fiber_func(): def resume(self, cancel = False): assert(not self.running() and not self.cancelled) + if self.ready_func: + assert(cancel or self.ready_func()) + self.ready_func = None + self.task.inst.store.pending.remove(self) assert(self.cancellable or 
not cancel) self.cancelled = cancel self.parent_lock = threading.Lock() @@ -410,18 +408,41 @@ def suspend(self, cancellable) -> bool: assert(cancellable or completed) return completed + def resume_later(self): + assert(self.suspended()) + self.ready_func = lambda: True + self.task.inst.store.pending.append(self) + def suspend_until(self, ready_func, cancellable = False) -> bool: assert(self.running()) if ready_func() and not DETERMINISTIC_PROFILE and random.randint(0,1): return True self.ready_func = ready_func self.task.inst.store.pending.append(self) - completed = self.suspend(cancellable) - assert(cancellable or ready_func()) - self.ready_func = None - self.task.inst.store.pending.remove(self) + return self.suspend(cancellable) + + def switch_to(self, cancellable, other: Thread) -> bool: + assert(self.running() and other.suspended()) + assert(not self.cancellable) + self.cancellable = cancellable + assert(self.parent_lock and not other.parent_lock) + other.parent_lock = self.parent_lock + self.parent_lock = None + assert(not self.running() and other.running()) + other.fiber_lock.release() + self.fiber_lock.acquire() + assert(self.running()) + self.cancellable = False + completed = not self.cancelled + self.cancelled = False return completed + def yield_to(self, cancellable, other: Thread) -> bool: + assert(not self.ready_func) + self.ready_func = lambda: True + self.task.inst.store.pending.append(self) + return self.switch_to(cancellable, other) + #### Waitable State class EventCode(IntEnum): @@ -504,8 +525,7 @@ class State(Enum): supertask: Optional[Task] on_resolve: OnResolve num_borrows: int - thread: Optional[Thread] - context: ContextLocalStorage + threads: list[Thread] def __init__(self, opts, inst, ft, supertask, on_resolve): self.state = Task.State.INITIAL @@ -515,18 +535,18 @@ def __init__(self, opts, inst, ft, supertask, on_resolve): self.supertask = supertask self.on_resolve = on_resolve self.num_borrows = 0 - self.thread = None - self.context = 
ContextLocalStorage() + self.threads = [] def thread_start(self, thread): - assert(self.thread is None and thread.task is self) - self.thread = thread + assert(thread not in self.threads and thread.task is self) + self.threads.append(thread) def thread_stop(self, thread): - assert(thread is self.thread and thread.task is self) - self.thread = None - trap_if(self.state != Task.State.RESOLVED) - assert(self.num_borrows == 0) + assert(thread in self.threads and thread.task is self) + self.threads.remove(thread) + if len(self.threads) == 0: + trap_if(self.state != Task.State.RESOLVED) + assert(self.num_borrows == 0) def trap_if_on_the_stack(self, inst): c = self.supertask @@ -537,13 +557,13 @@ def trap_if_on_the_stack(self, inst): def needs_exclusive(self): return self.opts.sync or self.opts.callback - def enter(self): - assert(self.thread is not None) + def enter(self, thread): + assert(thread in self.threads and thread.task is self) def has_backpressure(): return self.inst.backpressure > 0 or (self.needs_exclusive() and self.inst.exclusive) if has_backpressure() or self.inst.num_waiting_to_enter > 0: self.inst.num_waiting_to_enter += 1 - completed = self.thread.suspend_until(lambda: not has_backpressure(), cancellable = True) + completed = thread.suspend_until(lambda: not has_backpressure(), cancellable = True) self.inst.num_waiting_to_enter -= 1 if not completed: self.cancel() @@ -554,39 +574,67 @@ def has_backpressure(): return True def exit(self): - assert(self.thread is not None) + assert(len(self.threads) > 0) if self.needs_exclusive(): assert(self.inst.exclusive) self.inst.exclusive = False def request_cancellation(self): assert(self.state == Task.State.INITIAL) - if self.thread.cancellable and not (self.thread.in_event_loop and self.inst.exclusive): - self.state = Task.State.CANCEL_DELIVERED - self.thread.resume(cancel = True) - else: - self.state = Task.State.PENDING_CANCEL + random.shuffle(self.threads) + for thread in self.threads: + if thread.cancellable 
and not (thread.in_event_loop and self.inst.exclusive): + self.state = Task.State.CANCEL_DELIVERED + thread.resume(cancel = True) + return + self.state = Task.State.PENDING_CANCEL - def suspend_until(self, ready_func, cancellable) -> bool: + def deliver_pending_cancel(self, cancellable) -> bool: if cancellable and self.state == Task.State.PENDING_CANCEL: self.state = Task.State.CANCEL_DELIVERED + return True + return False + + def suspend(self, thread, cancellable) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): + return False + return thread.suspend(cancellable) + + def suspend_until(self, ready_func, thread, cancellable) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): return False - return self.thread.suspend_until(ready_func, cancellable) + return thread.suspend_until(ready_func, cancellable) - def wait_until(self, ready_func, wset, cancellable) -> EventTuple: + def switch_to(self, thread, cancellable, other_thread) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): + return False + return thread.switch_to(cancellable, other_thread) + + def yield_to(self, thread, cancellable, other_thread) -> bool: + assert(thread in self.threads and thread.task is self) + if self.deliver_pending_cancel(cancellable): + return False + return thread.yield_to(cancellable, other_thread) + + def wait_until(self, ready_func, thread, wset, cancellable) -> EventTuple: + assert(thread in self.threads and thread.task is self) wset.num_waiting += 1 def ready_and_has_event(): return ready_func() and wset.has_pending_event() - if not self.suspend_until(ready_and_has_event, cancellable): + if not self.suspend_until(ready_and_has_event, thread, cancellable): event = (EventCode.TASK_CANCELLED, 0, 0) else: event = wset.get_pending_event() wset.num_waiting -= 1 return event - def poll_until(self, ready_func, 
wset, cancellable) -> Optional[EventTuple]: + def poll_until(self, ready_func, thread, wset, cancellable) -> Optional[EventTuple]: + assert(thread in self.threads and thread.task is self) wset.num_waiting += 1 - if not self.suspend_until(ready_func, cancellable): + if not self.suspend_until(ready_func, thread, cancellable): event = (EventCode.TASK_CANCELLED, 0, 0) elif wset.has_pending_event(): event = wset.get_pending_event() @@ -595,8 +643,9 @@ def poll_until(self, ready_func, wset, cancellable) -> Optional[EventTuple]: wset.num_waiting -= 1 return event - def yield_until(self, ready_func, cancellable) -> EventTuple: - if not self.suspend_until(ready_func, cancellable): + def yield_until(self, ready_func, thread, cancellable) -> EventTuple: + assert(thread in self.threads and thread.task is self) + if not self.suspend_until(ready_func, thread, cancellable): return (EventCode.TASK_CANCELLED, 0, 0) else: return (EventCode.NONE, 0, 0) @@ -822,8 +871,9 @@ def write(self, inst, src_buffer, on_copy, on_copy_done): class CopyState(Enum): IDLE = 1 - COPYING = 2 - DONE = 3 + SYNC_COPYING = 2 + ASYNC_COPYING = 3 + DONE = 4 class CopyEnd(Waitable): state: CopyState @@ -832,8 +882,11 @@ def __init__(self): Waitable.__init__(self) self.state = CopyState.IDLE + def copying(self): + return self.state == CopyState.SYNC_COPYING or self.state == CopyState.ASYNC_COPYING + def drop(self): - trap_if(self.state == CopyState.COPYING) + trap_if(self.copying()) Waitable.drop(self) class ReadableStreamEnd(CopyEnd): @@ -1919,10 +1972,13 @@ def lower_flat_values(cx, max_flat, vs, ts, out_param = None): def canon_lift(opts, inst, ft, callee, caller, on_start, on_resolve) -> Call: task = Task(opts, inst, ft, caller, on_resolve) task.trap_if_on_the_stack(inst) - def thread_func(): - if not task.enter(): + def thread_func(thread): + if not task.enter(thread): return + assert(thread.index is None) + thread.index = inst.table.add(thread) + cx = LiftLowerContext(opts, inst, task) args = on_start() 
flat_args = lower_flat_values(cx, MAX_FLAT_PARAMS, args, ft.param_types()) @@ -1930,43 +1986,43 @@ def thread_func(): assert(types_match_values(flat_ft.params, flat_args)) if opts.sync: - flat_results = call_and_trap_on_throw(callee, task, flat_args) + flat_results = call_and_trap_on_throw(callee, thread, flat_args) assert(types_match_values(flat_ft.results, flat_results)) result = lift_flat_values(cx, MAX_FLAT_RESULTS, CoreValueIter(flat_results), ft.result_type()) task.return_(result) if opts.post_return is not None: inst.may_leave = False - [] = call_and_trap_on_throw(opts.post_return, task, flat_results) + [] = call_and_trap_on_throw(opts.post_return, thread, flat_results) inst.may_leave = True task.exit() return if not opts.callback: - [] = call_and_trap_on_throw(callee, task, flat_args) + [] = call_and_trap_on_throw(callee, thread, flat_args) assert(types_match_values(flat_ft.results, [])) task.exit() return - [packed] = call_and_trap_on_throw(callee, task, flat_args) + [packed] = call_and_trap_on_throw(callee, thread, flat_args) code,si = unpack_callback_result(packed) while code != CallbackCode.EXIT: thread.in_event_loop = True inst.exclusive = False match code: case CallbackCode.YIELD: - event = task.yield_until(lambda: not inst.exclusive, cancellable = True) + event = task.yield_until(lambda: not inst.exclusive, thread, cancellable = True) case CallbackCode.WAIT: wset = inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.wait_until(lambda: not inst.exclusive, wset, cancellable = True) + event = task.wait_until(lambda: not inst.exclusive, thread, wset, cancellable = True) case CallbackCode.POLL: wset = inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.poll_until(lambda: not inst.exclusive, wset, cancellable = True) + event = task.poll_until(lambda: not inst.exclusive, thread, wset, cancellable = True) thread.in_event_loop = False inst.exclusive = True event_code, p1, p2 = event - [packed] = 
call_and_trap_on_throw(opts.callback, task, [event_code, p1, p2]) + [packed] = call_and_trap_on_throw(opts.callback, thread, [event_code, p1, p2]) code,si = unpack_callback_result(packed) task.exit() return @@ -1990,18 +2046,18 @@ def unpack_callback_result(packed): waitable_set_index = packed >> 4 return (CallbackCode(code), waitable_set_index) -def call_and_trap_on_throw(callee, task, args): +def call_and_trap_on_throw(callee, thread, args): try: - return callee(task, args) + return callee(thread, args) except CoreWebAssemblyException: trap() ### `canon lower` -def canon_lower(opts, ft, callee: FuncInst, task, flat_args): - trap_if(not task.inst.may_leave) +def canon_lower(opts, ft, callee: FuncInst, thread, flat_args): + trap_if(not thread.task.inst.may_leave) subtask = Subtask() - cx = LiftLowerContext(opts, task.inst, subtask) + cx = LiftLowerContext(opts, thread.task.inst, subtask) flat_ft = flatten_functype(opts, ft, 'lower') assert(types_match_values(flat_ft.params, flat_args)) @@ -2038,11 +2094,11 @@ def on_resolve(result): nonlocal flat_results flat_results = lower_flat_values(cx, max_flat_results, result, ft.result_type(), flat_args) - subtask.callee = callee(task, on_start, on_resolve) + subtask.callee = callee(thread.task, on_start, on_resolve) if opts.sync: if not subtask.resolved(): - task.thread.suspend_until(subtask.resolved) + thread.suspend_until(subtask.resolved) assert(types_match_values(flat_ft.results, flat_results)) subtask.deliver_resolve() return flat_results @@ -2052,7 +2108,7 @@ def on_resolve(result): subtask.deliver_resolve() return [Subtask.State.RETURNED] else: - subtaski = task.inst.table.add(subtask) + subtaski = thread.task.inst.table.add(subtask) def on_progress(): def subtask_event(): if subtask.resolved(): @@ -2065,17 +2121,17 @@ def subtask_event(): ### `canon resource.new` -def canon_resource_new(rt, task, rep): - trap_if(not task.inst.may_leave) +def canon_resource_new(rt, thread, rep): + trap_if(not 
thread.task.inst.may_leave) h = ResourceHandle(rt, rep, own = True) - i = task.inst.table.add(h) + i = thread.task.inst.table.add(h) return [i] ### `canon resource.drop` -def canon_resource_drop(rt, sync, task, i): - trap_if(not task.inst.may_leave) - inst = task.inst +def canon_resource_drop(rt, sync, thread, i): + trap_if(not thread.task.inst.may_leave) + inst = thread.task.inst h = inst.table.remove(i) trap_if(not isinstance(h, ResourceHandle)) trap_if(h.rt is not rt) @@ -2092,60 +2148,61 @@ def canon_resource_drop(rt, sync, task, i): callee_opts = CanonicalOptions(sync = rt.dtor_sync, callback = rt.dtor_callback) ft = FuncType([U32Type()],[]) callee = partial(canon_lift, callee_opts, rt.impl, ft, rt.dtor) - flat_results = canon_lower(caller_opts, ft, callee, task, [h.rep]) + flat_results = canon_lower(caller_opts, ft, callee, thread, [h.rep]) else: - task.trap_if_on_the_stack(rt.impl) + thread.task.trap_if_on_the_stack(rt.impl) else: h.borrow_scope.num_borrows -= 1 return flat_results ### `canon resource.rep` -def canon_resource_rep(rt, task, i): - h = task.inst.table.get(i) +def canon_resource_rep(rt, thread, i): + h = thread.task.inst.table.get(i) trap_if(not isinstance(h, ResourceHandle)) trap_if(h.rt is not rt) return [h.rep] ### ๐Ÿ”€ `canon context.get` -def canon_context_get(t, i, task): +def canon_context_get(t, i, thread): assert(t == 'i32') - assert(i < ContextLocalStorage.LENGTH) - return [task.context.get(i)] + assert(i < Thread.CONTEXT_LENGTH) + return [thread.context[i]] ### ๐Ÿ”€ `canon context.set` -def canon_context_set(t, i, task, v): +def canon_context_set(t, i, thread, v): assert(t == 'i32') - assert(i < ContextLocalStorage.LENGTH) - task.context.set(i, v) + assert(i < Thread.CONTEXT_LENGTH) + thread.context[i] = v return [] ### ๐Ÿ”€ `canon backpressure.set` -def canon_backpressure_set(task, flat_args): +def canon_backpressure_set(thread, flat_args): assert(len(flat_args) == 1) - task.inst.backpressure = int(bool(flat_args[0])) + 
thread.task.inst.backpressure = int(bool(flat_args[0])) return [] ### ๐Ÿ”€ `canon backpressure.{inc,dec}` -def canon_backpressure_inc(task): - assert(0 <= task.inst.backpressure < 2**16) - task.inst.backpressure += 1 - trap_if(task.inst.backpressure == 2**16) +def canon_backpressure_inc(thread): + assert(0 <= thread.task.inst.backpressure < 2**16) + thread.task.inst.backpressure += 1 + trap_if(thread.task.inst.backpressure == 2**16) return [] -def canon_backpressure_dec(task): - assert(0 <= task.inst.backpressure < 2**16) - task.inst.backpressure -= 1 - trap_if(task.inst.backpressure < 0) +def canon_backpressure_dec(thread): + assert(0 <= thread.task.inst.backpressure < 2**16) + thread.task.inst.backpressure -= 1 + trap_if(thread.task.inst.backpressure < 0) return [] ### ๐Ÿ”€ `canon task.return` -def canon_task_return(task, result_type, opts: LiftOptions, flat_args): +def canon_task_return(thread, result_type, opts: LiftOptions, flat_args): + task = thread.task trap_if(not task.inst.may_leave) trap_if(task.opts.sync) trap_if(result_type != task.ft.result) @@ -2157,73 +2214,63 @@ def canon_task_return(task, result_type, opts: LiftOptions, flat_args): ### ๐Ÿ”€ `canon task.cancel` -def canon_task_cancel(task): +def canon_task_cancel(thread): + task = thread.task trap_if(not task.inst.may_leave) trap_if(task.opts.sync) task.cancel() return [] -### ๐Ÿ”€ `canon yield` - -def canon_yield(cancellable, task): - trap_if(not task.inst.may_leave) - event_code,_,_ = task.yield_until(lambda:True, cancellable) - match event_code: - case EventCode.NONE: - return [0] - case EventCode.TASK_CANCELLED: - return [1] - ### ๐Ÿ”€ `canon waitable-set.new` -def canon_waitable_set_new(task): - trap_if(not task.inst.may_leave) - return [ task.inst.table.add(WaitableSet()) ] +def canon_waitable_set_new(thread): + trap_if(not thread.task.inst.may_leave) + return [ thread.task.inst.table.add(WaitableSet()) ] ### ๐Ÿ”€ `canon waitable-set.wait` -def canon_waitable_set_wait(cancellable, mem, task, 
si, ptr): - trap_if(not task.inst.may_leave) - wset = task.inst.table.get(si) +def canon_waitable_set_wait(cancellable, mem, thread, si, ptr): + trap_if(not thread.task.inst.may_leave) + wset = thread.task.inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.wait_until(lambda:True, wset, cancellable) - return unpack_event(mem, task, ptr, event) + event = thread.task.wait_until(lambda: True, thread, wset, cancellable) + return unpack_event(mem, thread, ptr, event) -def unpack_event(mem, task, ptr, e: EventTuple): +def unpack_event(mem, thread, ptr, e: EventTuple): event, p1, p2 = e - cx = LiftLowerContext(LiftLowerOptions(memory = mem), task.inst) + cx = LiftLowerContext(LiftLowerOptions(memory = mem), thread.task.inst) store(cx, p1, U32Type(), ptr) store(cx, p2, U32Type(), ptr + 4) return [event] ### ๐Ÿ”€ `canon waitable-set.poll` -def canon_waitable_set_poll(cancellable, mem, task, si, ptr): - trap_if(not task.inst.may_leave) - wset = task.inst.table.get(si) +def canon_waitable_set_poll(cancellable, mem, thread, si, ptr): + trap_if(not thread.task.inst.may_leave) + wset = thread.task.inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) - event = task.poll_until(lambda:True, wset, cancellable) - return unpack_event(mem, task, ptr, event) + event = thread.task.poll_until(lambda: True, thread, wset, cancellable) + return unpack_event(mem, thread, ptr, event) ### ๐Ÿ”€ `canon waitable-set.drop` -def canon_waitable_set_drop(task, i): - trap_if(not task.inst.may_leave) - wset = task.inst.table.remove(i) +def canon_waitable_set_drop(thread, i): + trap_if(not thread.task.inst.may_leave) + wset = thread.task.inst.table.remove(i) trap_if(not isinstance(wset, WaitableSet)) wset.drop() return [] ### ๐Ÿ”€ `canon waitable.join` -def canon_waitable_join(task, wi, si): - trap_if(not task.inst.may_leave) - w = task.inst.table.get(wi) +def canon_waitable_join(thread, wi, si): + trap_if(not thread.task.inst.may_leave) + w = 
thread.task.inst.table.get(wi) trap_if(not isinstance(w, Waitable)) if si == 0: w.join(None) else: - wset = task.inst.table.get(si) + wset = thread.task.inst.table.get(si) trap_if(not isinstance(wset, WaitableSet)) w.join(wset) return [] @@ -2232,9 +2279,9 @@ def canon_waitable_join(task, wi, si): BLOCKED = 0xffff_ffff -def canon_subtask_cancel(sync, task, i): - trap_if(not task.inst.may_leave) - subtask = task.inst.table.get(i) +def canon_subtask_cancel(sync, thread, i): + trap_if(not thread.task.inst.may_leave) + subtask = thread.task.inst.table.get(i) trap_if(not isinstance(subtask, Subtask)) trap_if(subtask.resolve_delivered()) trap_if(subtask.cancellation_requested) @@ -2245,7 +2292,7 @@ def canon_subtask_cancel(sync, task, i): subtask.callee.request_cancellation() if not subtask.resolved(): if sync: - task.thread.suspend_until(subtask.resolved) + thread.suspend_until(subtask.resolved) else: return [BLOCKED] code,index,payload = subtask.get_pending_event() @@ -2255,53 +2302,52 @@ def canon_subtask_cancel(sync, task, i): ### ๐Ÿ”€ `canon subtask.drop` -def canon_subtask_drop(task, i): - trap_if(not task.inst.may_leave) - s = task.inst.table.remove(i) +def canon_subtask_drop(thread, i): + trap_if(not thread.task.inst.may_leave) + s = thread.task.inst.table.remove(i) trap_if(not isinstance(s, Subtask)) s.drop() return [] ### ๐Ÿ”€ `canon {stream,future}.new` -def canon_stream_new(stream_t, task): - trap_if(not task.inst.may_leave) +def canon_stream_new(stream_t, thread): + trap_if(not thread.task.inst.may_leave) shared = SharedStreamImpl(stream_t.t) - ri = task.inst.table.add(ReadableStreamEnd(shared)) - wi = task.inst.table.add(WritableStreamEnd(shared)) + ri = thread.task.inst.table.add(ReadableStreamEnd(shared)) + wi = thread.task.inst.table.add(WritableStreamEnd(shared)) return [ ri | (wi << 32) ] -def canon_future_new(future_t, task): - trap_if(not task.inst.may_leave) +def canon_future_new(future_t, thread): + trap_if(not thread.task.inst.may_leave) shared = 
SharedFutureImpl(future_t.t) - ri = task.inst.table.add(ReadableFutureEnd(shared)) - wi = task.inst.table.add(WritableFutureEnd(shared)) + ri = thread.task.inst.table.add(ReadableFutureEnd(shared)) + wi = thread.task.inst.table.add(WritableFutureEnd(shared)) return [ ri | (wi << 32) ] ### ๐Ÿ”€ `canon stream.{read,write}` -def canon_stream_read(stream_t, opts, task, i, ptr, n): +def canon_stream_read(stream_t, opts, thread, i, ptr, n): return stream_copy(ReadableStreamEnd, WritableBufferGuestImpl, EventCode.STREAM_READ, - stream_t, opts, task, i, ptr, n) + stream_t, opts, thread, i, ptr, n) -def canon_stream_write(stream_t, opts, task, i, ptr, n): +def canon_stream_write(stream_t, opts, thread, i, ptr, n): return stream_copy(WritableStreamEnd, ReadableBufferGuestImpl, EventCode.STREAM_WRITE, - stream_t, opts, task, i, ptr, n) + stream_t, opts, thread, i, ptr, n) -def stream_copy(EndT, BufferT, event_code, stream_t, opts, task, i, ptr, n): - trap_if(not task.inst.may_leave) - e = task.inst.table.get(i) +def stream_copy(EndT, BufferT, event_code, stream_t, opts, thread, i, ptr, n): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.get(i) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != stream_t.t) trap_if(e.state != CopyState.IDLE) assert(not contains_borrow(stream_t)) - cx = LiftLowerContext(opts, task.inst, borrow_scope = None) + cx = LiftLowerContext(opts, thread.task.inst, borrow_scope = None) buffer = BufferT(stream_t.t, cx, ptr, n) def stream_event(result, reclaim_buffer): reclaim_buffer() - assert(e.state == CopyState.COPYING) if result == CopyResult.DROPPED: e.state = CopyState.DONE else: @@ -2317,13 +2363,14 @@ def on_copy(reclaim_buffer): def on_copy_done(result): e.set_pending_event(partial(stream_event, result, reclaim_buffer = lambda:())) - e.state = CopyState.COPYING - e.copy(task.inst, buffer, on_copy, on_copy_done) + e.copy(thread.task.inst, buffer, on_copy, on_copy_done) if not e.has_pending_event(): if opts.sync: - 
task.thread.suspend_until(e.has_pending_event) + e.state = CopyState.SYNC_COPYING + thread.suspend_until(e.has_pending_event) else: + e.state = CopyState.ASYNC_COPYING return [BLOCKED] code,index,payload = e.get_pending_event() assert(code == event_code and index == i and payload != BLOCKED) @@ -2331,28 +2378,27 @@ def on_copy_done(result): ### ๐Ÿ”€ `canon future.{read,write}` -def canon_future_read(future_t, opts, task, i, ptr): +def canon_future_read(future_t, opts, thread, i, ptr): return future_copy(ReadableFutureEnd, WritableBufferGuestImpl, EventCode.FUTURE_READ, - future_t, opts, task, i, ptr) + future_t, opts, thread, i, ptr) -def canon_future_write(future_t, opts, task, i, ptr): +def canon_future_write(future_t, opts, thread, i, ptr): return future_copy(WritableFutureEnd, ReadableBufferGuestImpl, EventCode.FUTURE_WRITE, - future_t, opts, task, i, ptr) + future_t, opts, thread, i, ptr) -def future_copy(EndT, BufferT, event_code, future_t, opts, task, i, ptr): - trap_if(not task.inst.may_leave) - e = task.inst.table.get(i) +def future_copy(EndT, BufferT, event_code, future_t, opts, thread, i, ptr): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.get(i) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != future_t.t) trap_if(e.state != CopyState.IDLE) assert(not contains_borrow(future_t)) - cx = LiftLowerContext(opts, task.inst, borrow_scope = None) + cx = LiftLowerContext(opts, thread.task.inst, borrow_scope = None) buffer = BufferT(future_t.t, cx, ptr, 1) def future_event(result): assert((buffer.remain() == 0) == (result == CopyResult.COMPLETED)) - assert(e.state == CopyState.COPYING) if result == CopyResult.DROPPED or result == CopyResult.COMPLETED: e.state = CopyState.DONE else: @@ -2363,13 +2409,14 @@ def on_copy_done(result): assert(result != CopyResult.DROPPED or event_code == EventCode.FUTURE_WRITE) e.set_pending_event(partial(future_event, result)) - e.state = CopyState.COPYING - e.copy(task.inst, buffer, on_copy_done) + 
e.copy(thread.task.inst, buffer, on_copy_done) if not e.has_pending_event(): if opts.sync: - task.thread.suspend_until(e.has_pending_event) + e.state = CopyState.SYNC_COPYING + thread.suspend_until(e.has_pending_event) else: + e.state = CopyState.ASYNC_COPYING return [BLOCKED] code,index,payload = e.get_pending_event() assert(code == event_code and index == i) @@ -2377,88 +2424,174 @@ def on_copy_done(result): ### ๐Ÿ”€ `canon {stream,future}.cancel-{read,write}` -def canon_stream_cancel_read(stream_t, sync, task, i): - return cancel_copy(ReadableStreamEnd, EventCode.STREAM_READ, stream_t, sync, task, i) +def canon_stream_cancel_read(stream_t, sync, thread, i): + return cancel_copy(ReadableStreamEnd, EventCode.STREAM_READ, stream_t, sync, thread, i) -def canon_stream_cancel_write(stream_t, sync, task, i): - return cancel_copy(WritableStreamEnd, EventCode.STREAM_WRITE, stream_t, sync, task, i) +def canon_stream_cancel_write(stream_t, sync, thread, i): + return cancel_copy(WritableStreamEnd, EventCode.STREAM_WRITE, stream_t, sync, thread, i) -def canon_future_cancel_read(future_t, sync, task, i): - return cancel_copy(ReadableFutureEnd, EventCode.FUTURE_READ, future_t, sync, task, i) +def canon_future_cancel_read(future_t, sync, thread, i): + return cancel_copy(ReadableFutureEnd, EventCode.FUTURE_READ, future_t, sync, thread, i) -def canon_future_cancel_write(future_t, sync, task, i): - return cancel_copy(WritableFutureEnd, EventCode.FUTURE_WRITE, future_t, sync, task, i) +def canon_future_cancel_write(future_t, sync, thread, i): + return cancel_copy(WritableFutureEnd, EventCode.FUTURE_WRITE, future_t, sync, thread, i) -def cancel_copy(EndT, event_code, stream_or_future_t, sync, task, i): - trap_if(not task.inst.may_leave) - e = task.inst.table.get(i) +def cancel_copy(EndT, event_code, stream_or_future_t, sync, thread, i): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.get(i) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != 
stream_or_future_t.t) - trap_if(e.state != CopyState.COPYING) + trap_if(e.state != CopyState.ASYNC_COPYING) if not e.has_pending_event(): e.shared.cancel() if not e.has_pending_event(): if sync: - task.thread.suspend_until(e.has_pending_event) + thread.suspend_until(e.has_pending_event) else: return [BLOCKED] code,index,payload = e.get_pending_event() - assert(e.state != CopyState.COPYING and code == event_code and index == i) + assert(not e.copying() and code == event_code and index == i) return [payload] ### ๐Ÿ”€ `canon {stream,future}.drop-{readable,writable}` -def canon_stream_drop_readable(stream_t, task, i): - return drop(ReadableStreamEnd, stream_t, task, i) +def canon_stream_drop_readable(stream_t, thread, i): + return drop(ReadableStreamEnd, stream_t, thread, i) -def canon_stream_drop_writable(stream_t, task, hi): - return drop(WritableStreamEnd, stream_t, task, hi) +def canon_stream_drop_writable(stream_t, thread, hi): + return drop(WritableStreamEnd, stream_t, thread, hi) -def canon_future_drop_readable(future_t, task, i): - return drop(ReadableFutureEnd, future_t, task, i) +def canon_future_drop_readable(future_t, thread, i): + return drop(ReadableFutureEnd, future_t, thread, i) -def canon_future_drop_writable(future_t, task, hi): - return drop(WritableFutureEnd, future_t, task, hi) +def canon_future_drop_writable(future_t, thread, hi): + return drop(WritableFutureEnd, future_t, thread, hi) -def drop(EndT, stream_or_future_t, task, hi): - trap_if(not task.inst.may_leave) - e = task.inst.table.remove(hi) +def drop(EndT, stream_or_future_t, thread, hi): + trap_if(not thread.task.inst.may_leave) + e = thread.task.inst.table.remove(hi) trap_if(not isinstance(e, EndT)) trap_if(e.shared.t != stream_or_future_t.t) e.drop() return [] +### ๐Ÿงต `canon thread.index` + +def canon_thread_index(thread): + assert(thread.index is not None) + return [thread.index] + +### ๐Ÿงต `canon thread.new_indirect` + +@dataclass +class CoreFuncRef: + t: CoreFuncType + callee: 
Callable[[Thread, list[CoreValType]], list[CoreValType]] + +def canon_thread_new_indirect(ft, ftbl: Table[CoreFuncRef], thread, fi, c): + trap_if(not thread.task.inst.may_leave) + f = ftbl.get(fi) + assert(ft == CoreFuncType(['i32'], [])) + trap_if(f.t != ft) + def thread_func(thread): + [] = call_and_trap_on_throw(f.callee, thread, [c]) + new_thread = Thread(thread.task, thread_func) + assert(new_thread.suspended()) + new_thread.index = thread.task.inst.table.add(new_thread) + return [new_thread.index] + +### ๐Ÿงต `canon thread.switch-to` + +class SuspendResult(IntEnum): + COMPLETED = 0 + CANCELLED = 1 + +def canon_thread_switch_to(cancellable, thread, i): + trap_if(not thread.task.inst.may_leave) + other_thread = thread.task.inst.table.get(i) + trap_if(not isinstance(other_thread, Thread)) + trap_if(not other_thread.suspended()) + if not thread.task.switch_to(thread, cancellable, other_thread): + assert(cancellable) + return [SuspendResult.CANCELLED] + else: + return [SuspendResult.COMPLETED] + +### ๐Ÿงต `canon thread.suspend` + +def canon_thread_suspend(cancellable, thread): + trap_if(not thread.task.inst.may_leave) + if not thread.task.suspend(thread, cancellable): + assert(cancellable) + return [SuspendResult.CANCELLED] + else: + return [SuspendResult.COMPLETED] + +### ๐Ÿงต `canon thread.resume-later` + +def canon_thread_resume_later(thread, i): + trap_if(not thread.task.inst.may_leave) + other_thread = thread.task.inst.table.get(i) + trap_if(not isinstance(other_thread, Thread)) + trap_if(not other_thread.suspended()) + other_thread.resume_later() + return [] + +### ๐Ÿงต `canon thread.yield-to` + +def canon_thread_yield_to(cancellable, thread, i): + trap_if(not thread.task.inst.may_leave) + other_thread = thread.task.inst.table.get(i) + trap_if(not isinstance(other_thread, Thread)) + trap_if(not other_thread.suspended()) + if not thread.task.yield_to(thread, cancellable, other_thread): + assert(cancellable) + return [SuspendResult.CANCELLED] + else: + return 
[SuspendResult.COMPLETED] + +### ๐Ÿงต `canon thread.yield` + +def canon_thread_yield(cancellable, thread): + trap_if(not thread.task.inst.may_leave) + event_code,_,_ = thread.task.yield_until(lambda: True, thread, cancellable) + match event_code: + case EventCode.NONE: + return [SuspendResult.COMPLETED] + case EventCode.TASK_CANCELLED: + return [SuspendResult.CANCELLED] + ### ๐Ÿ“ `canon error-context.new` @dataclass class ErrorContext: debug_message: String -def canon_error_context_new(opts, task, ptr, tagged_code_units): - trap_if(not task.inst.may_leave) +def canon_error_context_new(opts, thread, ptr, tagged_code_units): + trap_if(not thread.task.inst.may_leave) if DETERMINISTIC_PROFILE or random.randint(0,1): s = String(('', 'utf8', 0)) else: - cx = LiftLowerContext(opts, task.inst) + cx = LiftLowerContext(opts, thread.task.inst) s = load_string_from_range(cx, ptr, tagged_code_units) s = host_defined_transformation(s) - i = task.inst.table.add(ErrorContext(s)) + i = thread.task.inst.table.add(ErrorContext(s)) return [i] ### ๐Ÿ“ `canon error-context.debug-message` -def canon_error_context_debug_message(opts, task, i, ptr): - trap_if(not task.inst.may_leave) - errctx = task.inst.table.get(i) +def canon_error_context_debug_message(opts, thread, i, ptr): + trap_if(not thread.task.inst.may_leave) + errctx = thread.task.inst.table.get(i) trap_if(not isinstance(errctx, ErrorContext)) - cx = LiftLowerContext(opts, task.inst) + cx = LiftLowerContext(opts, thread.task.inst) store_string(cx, errctx.debug_message, ptr) return [] ### ๐Ÿ“ `canon error-context.drop` -def canon_error_context_drop(task, i): - trap_if(not task.inst.may_leave) - errctx = task.inst.table.remove(i) +def canon_error_context_drop(thread, i): + trap_if(not thread.task.inst.may_leave) + errctx = thread.task.inst.table.remove(i) trap_if(not isinstance(errctx, ErrorContext)) return [] diff --git a/design/mvp/canonical-abi/diff.py b/design/mvp/canonical-abi/diff.py index c99ce970..8ca69d2a 100755 --- 
a/design/mvp/canonical-abi/diff.py +++ b/design/mvp/canonical-abi/diff.py @@ -30,8 +30,8 @@ def is_comment_or_empty(line): def is_canon_thread_function(line): normalized = normalize_line(line) - return (normalized.startswith('async def canon_thread') or - normalized.startswith('def canon_thread')) + return (normalized.startswith('def canon_thread_spawn') or + normalized.startswith('def canon_thread_available_parallelism')) def filter_canon_thread_functions(code_blocks): filtered_blocks = [] diff --git a/design/mvp/canonical-abi/run_tests.py b/design/mvp/canonical-abi/run_tests.py index 403df6d7..b5d59ef1 100644 --- a/design/mvp/canonical-abi/run_tests.py +++ b/design/mvp/canonical-abi/run_tests.py @@ -54,19 +54,19 @@ def mk_cx(memory = bytearray(), encoding = 'utf8', realloc = None, post_return = def run_lift(opts, inst, ft, callee, on_start, on_resolve): lifted_func = partial(canon_lift, opts, inst, ft, callee) task = inst.store.invoke(lifted_func, None, on_start, on_resolve) - while task.thread is not None: + while inst.store.pending: inst.store.tick() def mk_task(caller, on_resolve, thread_func): inst = ComponentInstance(caller.inst.store) task = Task(None, inst, None, caller, on_resolve) - thread = Thread(task, partial(thread_func, task)) + thread = Thread(task, thread_func) thread.resume() return task def mk_done_task(caller): - def empty(task): - task.state = Task.State.RESOLVED + def empty(thread): + thread.task.state = Task.State.RESOLVED return mk_task(caller, lambda _:(), empty) def mk_str(s): @@ -404,7 +404,7 @@ def test_roundtrip(t, v): store = Store() ft = FuncType([t],[t]) - def callee(task, x): + def callee(thread, x): return x callee_heap = Heap(1000) @@ -443,7 +443,7 @@ def test_handles(): definitions.MAX_FLAT_RESULTS = 16 dtor_value = None - def dtor(task, args): + def dtor(thread, args): nonlocal dtor_value assert(len(args) == 1) dtor_value = args[0] @@ -463,19 +463,22 @@ def host_import(caller, on_start, on_resolve): on_resolve([45]) return 
mk_done_task(caller) - def core_wasm(task, args): + def core_wasm(thread, args): nonlocal dtor_value assert(len(args) == 4) - assert(len(inst.table.array) == 4) + assert(len(inst.table.array) == 5) assert(inst.table.array[0] is None) - assert(args[0] == 1) - assert(args[1] == 2) - assert(args[2] == 3) + assert(args[0] == 2) + assert(args[1] == 3) + assert(args[2] == 4) assert(args[3] == 13) - assert((canon_resource_rep(rt, task, 1))[0] == 42) - assert((canon_resource_rep(rt, task, 2))[0] == 43) - assert((canon_resource_rep(rt, task, 3))[0] == 44) + h1 = args[0] + h2 = args[1] + h3 = args[2] + assert((canon_resource_rep(rt, thread, h1))[0] == 42) + assert((canon_resource_rep(rt, thread, h2))[0] == 43) + assert((canon_resource_rep(rt, thread, h3))[0] == 44) host_ft = FuncType([ BorrowType(rt), @@ -484,35 +487,36 @@ def core_wasm(task, args): OwnType(rt) ]) args = [ - 1, - 3 + h1, + h3 ] - results = canon_lower(opts, host_ft, host_import, task, args) + results = canon_lower(opts, host_ft, host_import, thread, args) assert(len(results) == 1) - assert(results[0] == 4) - assert((canon_resource_rep(rt, task, 4))[0] == 45) + assert(results[0] == 5) + h4 = results[0] + assert((canon_resource_rep(rt, thread, h4))[0] == 45) dtor_value = None - canon_resource_drop(rt, True, task, 1) + canon_resource_drop(rt, True, thread, h1) assert(dtor_value == 42) - assert(len(inst.table.array) == 5) - assert(inst.table.array[1] is None) + assert(len(inst.table.array) == 6) + assert(inst.table.array[h1] is None) assert(len(inst.table.free) == 1) - h = (canon_resource_new(rt, task, 46))[0] - assert(h == 1) - assert(len(inst.table.array) == 5) - assert(inst.table.array[1] is not None) + h = (canon_resource_new(rt, thread, 46))[0] + assert(h == h1) + assert(len(inst.table.array) == 6) + assert(inst.table.array[h] is not None) assert(len(inst.table.free) == 0) dtor_value = None - canon_resource_drop(rt, True, task, 3) + canon_resource_drop(rt, True, thread, h3) assert(dtor_value is None) - 
assert(len(inst.table.array) == 5) - assert(inst.table.array[3] is None) + assert(len(inst.table.array) == 6) + assert(inst.table.array[h3] is None) assert(len(inst.table.free) == 1) - return [1, 2, 4] + return [h, h2, h4] ft = FuncType([ OwnType(rt), @@ -539,9 +543,9 @@ def on_resolve(results): assert(got[0] == 46) assert(got[1] == 43) assert(got[2] == 45) - assert(len(inst.table.array) == 5) + assert(len(inst.table.array) == 6) assert(all(inst.table.array[i] is None for i in range(4))) - assert(len(inst.table.free) == 4) + assert(len(inst.table.free) == 5) definitions.MAX_FLAT_RESULTS = before @@ -554,33 +558,33 @@ def test_async_to_async(): producer_inst = ComponentInstance(store) eager_ft = FuncType([], [U8Type()]) - def core_eager_producer(task, args): + def core_eager_producer(thread, args): assert(len(args) == 0) - [] = canon_task_return(task, [U8Type()], producer_opts, [43]) + [] = canon_task_return(thread, [U8Type()], producer_opts, [43]) return [] eager_callee = partial(canon_lift, producer_opts, producer_inst, eager_ft, core_eager_producer) toggle_ft = FuncType([], []) fut1_1 = RacyBool(False) fut1_2 = RacyBool(False) - def core_toggle(task, args): + def core_toggle(thread, args): assert(len(args) == 0) - [] = canon_backpressure_inc(task) - task.thread.suspend_until(fut1_1.is_set) - [] = canon_task_return(task, [], producer_opts, []) - task.thread.suspend_until(fut1_2.is_set) - [] = canon_backpressure_dec(task) + [] = canon_backpressure_inc(thread) + thread.suspend_until(fut1_1.is_set) + [] = canon_task_return(thread, [], producer_opts, []) + thread.suspend_until(fut1_2.is_set) + [] = canon_backpressure_dec(thread) return [] toggle_callee = partial(canon_lift, producer_opts, producer_inst, toggle_ft, core_toggle) fut2, fut3, fut4 = RacyBool(False), RacyBool(False), RacyBool(False) blocking_ft = FuncType([U8Type()], [U8Type()]) - def core_blocking_producer(task, args): + def core_blocking_producer(thread, args): [x] = args assert(x == 83) - 
task.thread.suspend_until(fut2.is_set) - [] = canon_task_return(task, [U8Type()], producer_opts, [44]) - task.thread.suspend_until(fut3.is_set) + thread.suspend_until(fut2.is_set) + [] = canon_task_return(thread, [U8Type()], producer_opts, [44]) + thread.suspend_until(fut3.is_set) fut4.set() return [] blocking_callee = partial(canon_lift, producer_opts, producer_inst, blocking_ft, core_blocking_producer) @@ -589,83 +593,83 @@ def core_blocking_producer(task, args): consumer_opts = mk_opts(consumer_heap.memory) consumer_opts.sync = False - def consumer(task, args): + def consumer(thread, args): [b] = args - [seti] = canon_waitable_set_new(task) + [seti] = canon_waitable_set_new(thread) ptr = consumer_heap.realloc(0, 0, 1, 1) - [ret] = canon_lower(consumer_opts, eager_ft, eager_callee, task, [ptr]) + [ret] = canon_lower(consumer_opts, eager_ft, eager_callee, thread, [ptr]) assert(ret == Subtask.State.RETURNED) u8 = consumer_heap.memory[ptr] assert(u8 == 43) - [ret] = canon_lower(consumer_opts, toggle_ft, toggle_callee, task, []) + [ret] = canon_lower(consumer_opts, toggle_ft, toggle_callee, thread, []) state,subi1 = unpack_result(ret) - assert(subi1 == 2) + assert(subi1 == 3) assert(state == Subtask.State.STARTED) - [] = canon_waitable_join(task, subi1, seti) + [] = canon_waitable_join(thread, subi1, seti) retp = ptr consumer_heap.memory[retp] = 13 - [ret] = canon_lower(consumer_opts, blocking_ft, blocking_callee, task, [83, retp]) + [ret] = canon_lower(consumer_opts, blocking_ft, blocking_callee, thread, [83, retp]) state,subi2 = unpack_result(ret) - assert(subi2 == 3) + assert(subi2 == 4) assert(state == Subtask.State.STARTING) assert(consumer_heap.memory[retp] == 13) - [] = canon_waitable_join(task, subi2, seti) + [] = canon_waitable_join(thread, subi2, seti) fut1_1.set() waitretp = consumer_heap.realloc(0, 0, 8, 4) - [event] = canon_waitable_set_wait(True, consumer_heap.memory, task, seti, waitretp) + [event] = canon_waitable_set_wait(True, consumer_heap.memory, 
thread, seti, waitretp) assert(event == EventCode.SUBTASK) assert(consumer_heap.memory[waitretp] == subi1) assert(consumer_heap.memory[waitretp+4] == Subtask.State.RETURNED) - [] = canon_subtask_drop(task, subi1) + [] = canon_subtask_drop(thread, subi1) fut1_2.set() - [event] = canon_waitable_set_wait(True, consumer_heap.memory, task, seti, waitretp) + [event] = canon_waitable_set_wait(True, consumer_heap.memory, thread, seti, waitretp) assert(event == EventCode.SUBTASK) assert(consumer_heap.memory[waitretp] == subi2) assert(consumer_heap.memory[waitretp+4] == Subtask.State.STARTED) assert(consumer_heap.memory[retp] == 13) fut2.set() - [event] = canon_waitable_set_wait(True, consumer_heap.memory, task, seti, waitretp) + [event] = canon_waitable_set_wait(True, consumer_heap.memory, thread, seti, waitretp) assert(event == EventCode.SUBTASK) assert(consumer_heap.memory[waitretp] == subi2) assert(consumer_heap.memory[waitretp+4] == Subtask.State.RETURNED) assert(consumer_heap.memory[retp] == 44) - [] = canon_subtask_drop(task, subi2) + [] = canon_subtask_drop(thread, subi2) fut3.set() - task.thread.suspend_until(fut4.is_set) + thread.suspend_until(fut4.is_set) dtor_fut = RacyBool(False) dtor_value = None - def dtor(task, args): + def dtor(thread, args): nonlocal dtor_value assert(len(args) == 1) - task.thread.suspend_until(dtor_fut.is_set) + thread.suspend_until(dtor_fut.is_set) dtor_value = args[0] return [] rt = ResourceType(producer_inst, dtor) - [resi] = canon_resource_new(rt, task, 50) - assert(resi == 3) + [resi] = canon_resource_new(rt, thread, 50) + assert(resi == 4) assert(dtor_value is None) - [ret] = canon_resource_drop(rt, False, task, resi) + [ret] = canon_resource_drop(rt, False, thread, resi) state,dtorsubi = unpack_result(ret) - assert(dtorsubi == 3) + assert(dtorsubi == 4) assert(state == Subtask.State.STARTED) assert(dtor_value is None) dtor_fut.set() - [] = canon_waitable_join(task, dtorsubi, seti) - [event] = canon_waitable_set_wait(True, 
consumer_heap.memory, task, seti, waitretp) + [] = canon_waitable_join(thread, dtorsubi, seti) + [event] = canon_waitable_set_wait(True, consumer_heap.memory, thread, seti, waitretp) assert(event == EventCode.SUBTASK) assert(consumer_heap.memory[waitretp] == dtorsubi) assert(consumer_heap.memory[waitretp+4] == Subtask.State.RETURNED) assert(dtor_value == 50) - [] = canon_subtask_drop(task, dtorsubi) - [] = canon_waitable_set_drop(task, seti) + [] = canon_subtask_drop(thread, dtorsubi) + [] = canon_waitable_set_drop(thread, seti) - [] = canon_task_return(task, [U8Type()], consumer_opts, [42]) + [] = canon_task_return(thread, [U8Type()], consumer_opts, [42]) return [] ft = FuncType([BoolType()],[U8Type()]) @@ -691,10 +695,10 @@ def test_async_callback(): producer_opts.sync = False producer_ft = FuncType([], []) - def core_producer_pre(fut, task, args): + def core_producer_pre(fut, thread, args): assert(len(args) == 0) - task.thread.suspend_until(fut.is_set) - canon_task_return(task, [], producer_opts, []) + thread.suspend_until(fut.is_set) + canon_task_return(thread, [], producer_opts, []) return [] fut1 = RacyBool(False) core_producer1 = partial(core_producer_pre, fut1) @@ -705,53 +709,55 @@ def core_producer_pre(fut, task, args): consumer_ft = FuncType([],[U32Type()]) seti = 0 - def consumer(task, args): + def consumer(thread, args): assert(len(args) == 0) - [ret] = canon_lower(opts, producer_ft, producer1, task, []) + [ret] = canon_lower(opts, producer_ft, producer1, thread, []) state,subi1 = unpack_result(ret) - assert(subi1 == 1) + assert(subi1 == 2) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(opts, producer_ft, producer2, task, []) + [ret] = canon_lower(opts, producer_ft, producer2, thread, []) state,subi2 = unpack_result(ret) - assert(subi2 == 2) + assert(subi2 == 3) assert(state == Subtask.State.STARTED) nonlocal seti - [seti] = canon_waitable_set_new(task) - assert(seti == 3) - [] = canon_waitable_join(task, subi1, seti) - [] = 
canon_waitable_join(task, subi2, seti) + [seti] = canon_waitable_set_new(thread) + assert(seti == 4) + [] = canon_waitable_join(thread, subi1, seti) + [] = canon_waitable_join(thread, subi2, seti) fut1.set() - [] = canon_context_set('i32', 0, task, 42) + [] = canon_context_set('i32', 0, thread, 42) return [definitions.CallbackCode.WAIT|(seti << 4)] - def callback(task, args): + def callback(thread, args): assert(len(args) == 3) - [ctx] = canon_context_get('i32', 0, task) + [ctx] = canon_context_get('i32', 0, thread) match ctx: case 42: assert(args[0] == EventCode.SUBTASK) - assert(args[1] == 1) + assert(args[1] == 2) assert(args[2] == Subtask.State.RETURNED) - canon_subtask_drop(task, 1) - [] = canon_context_set('i32', 0, task, 52) + subi = args[1] + canon_subtask_drop(thread, subi) + [] = canon_context_set('i32', 0, thread, 52) return [definitions.CallbackCode.YIELD] case 52: assert(args[0] == EventCode.NONE) assert(args[1] == 0) assert(args[2] == 0) fut2.set() - [] = canon_context_set('i32', 0, task, 62) + [] = canon_context_set('i32', 0, thread, 62) return [definitions.CallbackCode.WAIT | (seti << 4)] case 62: assert(args[0] == EventCode.SUBTASK) - assert(args[1] == 2) + assert(args[1] == 3) assert(args[2] == Subtask.State.RETURNED) - canon_subtask_drop(task, 2) - [] = canon_task_return(task, [U32Type()], opts, [83]) + subi = args[1] + canon_subtask_drop(thread, subi) + [] = canon_task_return(thread, [U32Type()], opts, [83]) return [definitions.CallbackCode.EXIT] case _: assert(False) @@ -777,36 +783,36 @@ def test_callback_interleaving(): producer_inst = ComponentInstance(store) producer_ft = FuncType([U32Type(), FutureType(None),FutureType(None),FutureType(None)],[U32Type()]) fut3s = [None,None] - def core_producer(task, args): + def core_producer(thread, args): [i,fut1,fut2,fut3] = args fut3s[i] = fut3 - [] = canon_context_set('i32', 0, task, i) + [] = canon_context_set('i32', 0, thread, i) sync_opts = mk_opts() - [ret] = canon_future_read(FutureType(None), 
sync_opts, task, fut1, 0xdeadbeef) + [ret] = canon_future_read(FutureType(None), sync_opts, thread, fut1, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) - [seti] = canon_waitable_set_new(task) + [seti] = canon_waitable_set_new(thread) async_opts = mk_opts(sync = False) - [ret] = canon_future_read(FutureType(None), async_opts, task, fut2, 0xdeadbeef) + [ret] = canon_future_read(FutureType(None), async_opts, thread, fut2, 0xdeadbeef) assert(ret == definitions.BLOCKED) - [] = canon_waitable_join(task, fut2, seti) + [] = canon_waitable_join(thread, fut2, seti) return [CallbackCode.WAIT|(seti << 4)] - def core_producer_callback(task, args): + def core_producer_callback(thread, args): [event,payload1,payload2] = args assert(event == EventCode.FUTURE_READ) assert(payload2 == CopyResult.COMPLETED) - [i] = canon_context_get('i32', 0, task) - [] = canon_task_return(task, [U32Type()], mk_opts(), [42 + i]) + [i] = canon_context_get('i32', 0, thread) + [] = canon_task_return(thread, [U32Type()], mk_opts(), [42 + i]) fut3 = fut3s[i] sync_opts = mk_opts() - [ret] = canon_future_read(FutureType(None), sync_opts, task, fut3, 0xdeadbeef) + [ret] = canon_future_read(FutureType(None), sync_opts, thread, fut3, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) return [CallbackCode.EXIT] @@ -816,7 +822,7 @@ def core_producer_callback(task, args): producer_callee = partial(canon_lift, producer_opts, producer_inst, producer_ft, core_producer) sync_callee_ft = FuncType([], [U32Type()]) - def core_sync_callee(task, args): + def core_sync_callee(thread, args): assert(len(args) == 0) return [100] sync_callee_opts = mk_opts() @@ -826,113 +832,113 @@ def core_sync_callee(task, args): consumer_ft = FuncType([], []) consumer_mem = bytearray(24) consumer_opts = mk_opts(consumer_mem, sync = False) - def core_consumer(task, args): + def core_consumer(thread, args): assert(len(args) == 0) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) 
rfut11,wfut11 = unpack_new_ends(packed) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) rfut12,wfut12 = unpack_new_ends(packed) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) rfut13,wfut13 = unpack_new_ends(packed) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) rfut21,wfut21 = unpack_new_ends(packed) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) rfut22,wfut22 = unpack_new_ends(packed) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) rfut23,wfut23 = unpack_new_ends(packed) producer_inst.backpressure = True - [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, task, [0, rfut11, rfut12, rfut13, 0xdeadbeef]) + [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, thread, [0, rfut11, rfut12, rfut13, 0xdeadbeef]) state,todie = unpack_result(ret) assert(state == Subtask.State.STARTING) - [ret] = canon_subtask_cancel(True, task, todie) + [ret] = canon_subtask_cancel(True, thread, todie) assert(ret == Subtask.State.CANCELLED_BEFORE_STARTED) producer_inst.backpressure = False subi1ret = 12 - [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, task, [0, rfut11, rfut12, rfut13, subi1ret]) + [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, thread, [0, rfut11, rfut12, rfut13, subi1ret]) state,subi1 = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, task, [1, rfut21, rfut22, rfut23, 0xdeadbeef]) + [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, thread, [1, rfut21, rfut22, rfut23, 0xdeadbeef]) state,todie = unpack_result(ret) assert(state == Subtask.State.STARTING) - [ret] = canon_subtask_cancel(True, task, todie) + [ret] = 
canon_subtask_cancel(True, thread, todie) assert(ret == Subtask.State.CANCELLED_BEFORE_STARTED) subi2ret = 16 - [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, task, [1, rfut21, rfut22, rfut23, subi2ret]) + [ret] = canon_lower(consumer_opts, producer_ft, producer_callee, thread, [1, rfut21, rfut22, rfut23, subi2ret]) state,subi2 = unpack_result(ret) assert(state == Subtask.State.STARTING) - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, subi1, seti) - [] = canon_waitable_join(task, subi2, seti) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, subi1, seti) + [] = canon_waitable_join(thread, subi2, seti) - [ret] = canon_future_write(FutureType(None), consumer_opts, task, wfut11, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), consumer_opts, thread, wfut11, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) retp = 0 - [event] = canon_waitable_set_wait(True, consumer_mem, task, seti, retp) + [event] = canon_waitable_set_wait(True, consumer_mem, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(consumer_mem[retp+0] == subi2) assert(consumer_mem[retp+4] == Subtask.State.STARTED) - [ret] = canon_future_write(FutureType(None), consumer_opts, task, wfut12, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), consumer_opts, thread, wfut12, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) for i in range(10): - [ret] = canon_yield(True, task) + [ret] = canon_thread_yield(True, thread) assert(ret == 0) retp = 0 - [ret] = canon_waitable_set_poll(True, consumer_mem, task, seti, retp) + [ret] = canon_waitable_set_poll(True, consumer_mem, thread, seti, retp) assert(ret == EventCode.NONE) - [ret] = canon_future_write(FutureType(None), consumer_opts, task, wfut21, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), consumer_opts, thread, wfut21, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) retp = 0 - [event] = canon_waitable_set_wait(True, consumer_mem, task, seti, retp) + [event] 
= canon_waitable_set_wait(True, consumer_mem, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(consumer_mem[retp+0] == subi1) assert(consumer_mem[retp+4] == Subtask.State.RETURNED) assert(consumer_mem[subi1ret] == 42) - [] = canon_subtask_drop(task, subi1) + [] = canon_subtask_drop(thread, subi1) - [ret] = canon_future_write(FutureType(None), consumer_opts, task, wfut22, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), consumer_opts, thread, wfut22, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) for i in range(10): - [ret] = canon_yield(True, task) + [ret] = canon_thread_yield(True, thread) assert(ret == 0) retp = 0 - [ret] = canon_waitable_set_poll(True, consumer_mem, task, seti, retp) + [ret] = canon_waitable_set_poll(True, consumer_mem, thread, seti, retp) assert(ret == EventCode.NONE) - [ret] = canon_future_write(FutureType(None), consumer_opts, task, wfut13, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), consumer_opts, thread, wfut13, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) retp = 0 - [event] = canon_waitable_set_wait(True, consumer_mem, task, seti, retp) + [event] = canon_waitable_set_wait(True, consumer_mem, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(consumer_mem[retp+0] == subi2) assert(consumer_mem[retp+4] == Subtask.State.RETURNED) assert(consumer_mem[subi2ret] == 43) - [] = canon_subtask_drop(task, subi2) + [] = canon_subtask_drop(thread, subi2) subi3ret = 20 - [ret] = canon_lower(consumer_opts, sync_callee_ft, sync_callee, task, [subi3ret]) + [ret] = canon_lower(consumer_opts, sync_callee_ft, sync_callee, thread, [subi3ret]) state,subi3 = unpack_result(ret) assert(state == Subtask.State.STARTING) - [] = canon_waitable_join(task, subi3, seti) + [] = canon_waitable_join(thread, subi3, seti) - [ret] = canon_future_write(FutureType(None), consumer_opts, task, wfut23, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), consumer_opts, thread, wfut23, 0xdeadbeef) assert(ret == 
CopyResult.COMPLETED) retp = 0 - [event] = canon_waitable_set_wait(True, consumer_mem, task, seti, retp) + [event] = canon_waitable_set_wait(True, consumer_mem, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(consumer_mem[retp+0] == subi3) assert(consumer_mem[retp+4] == Subtask.State.RETURNED) @@ -951,15 +957,15 @@ def test_async_to_sync(): producer_ft = FuncType([],[]) fut = RacyBool(False) producer1_done = False - def producer1_core(task, args): + def producer1_core(thread, args): nonlocal producer1_done assert(len(args) == 0) - task.thread.suspend_until(fut.is_set) + thread.suspend_until(fut.is_set) producer1_done = True return [] producer2_done = False - def producer2_core(task, args): + def producer2_core(thread, args): nonlocal producer2_done assert(len(args) == 0) assert(producer1_done == True) @@ -974,22 +980,22 @@ def producer2_core(task, args): consumer_opts.sync = False consumer_ft = FuncType([],[U8Type()]) - def consumer(task, args): + def consumer(thread, args): assert(len(args) == 0) - [ret] = canon_lower(consumer_opts, producer_ft, producer1, task, []) + [ret] = canon_lower(consumer_opts, producer_ft, producer1, thread, []) state,subi1 = unpack_result(ret) - assert(subi1 == 1) + assert(subi1 == 2) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(consumer_opts, producer_ft, producer2, task, []) + [ret] = canon_lower(consumer_opts, producer_ft, producer2, thread, []) state,subi2 = unpack_result(ret) - assert(subi2 == 2) + assert(subi2 == 3) assert(state == Subtask.State.STARTING) - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, subi1, seti) - [] = canon_waitable_join(task, subi2, seti) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, subi1, seti) + [] = canon_waitable_join(thread, subi2, seti) fut.set() assert(producer1_done == False) @@ -998,21 +1004,21 @@ def consumer(task, args): remain = [subi1, subi2] while remain: retp = 8 - [event] = canon_waitable_set_poll(True, 
consumer_heap.memory, task, seti, retp) + [event] = canon_waitable_set_poll(True, consumer_heap.memory, thread, seti, retp) if event == EventCode.NONE: continue assert(event == EventCode.SUBTASK) assert(consumer_heap.memory[retp+4] == Subtask.State.RETURNED) subi = consumer_heap.memory[retp] remain.remove(subi) - canon_subtask_drop(task, subi) + canon_subtask_drop(thread, subi) assert(producer1_done == True) assert(producer2_done == True) - [] = canon_waitable_set_drop(task, seti) + [] = canon_waitable_set_drop(thread, seti) - canon_task_return(task, [U8Type()], consumer_opts, [83]) + canon_task_return(thread, [U8Type()], consumer_opts, [83]) return [] consumer_inst = ComponentInstance(store) @@ -1036,20 +1042,20 @@ def test_async_backpressure(): producer_ft = FuncType([],[]) fut = RacyBool(False) producer1_done = False - def producer1_core(task, args): + def producer1_core(thread, args): nonlocal producer1_done - canon_backpressure_inc(task) - task.thread.suspend_until(fut.is_set) - canon_backpressure_dec(task) - canon_task_return(task, [], producer_opts, []) + canon_backpressure_inc(thread) + thread.suspend_until(fut.is_set) + canon_backpressure_dec(thread) + canon_task_return(thread, [], producer_opts, []) producer1_done = True return [] producer2_done = False - def producer2_core(task, args): + def producer2_core(thread, args): nonlocal producer2_done assert(producer1_done == True) - canon_task_return(task, [], producer_opts, []) + canon_task_return(thread, [], producer_opts, []) producer2_done = True return [] @@ -1060,22 +1066,22 @@ def producer2_core(task, args): consumer_opts = mk_opts(consumer_heap.memory, sync = False) consumer_ft = FuncType([],[U8Type()]) - def consumer(task, args): + def consumer(thread, args): assert(len(args) == 0) - [ret] = canon_lower(consumer_opts, producer_ft, producer1, task, []) + [ret] = canon_lower(consumer_opts, producer_ft, producer1, thread, []) state,subi1 = unpack_result(ret) - assert(subi1 == 1) + assert(subi1 == 2) 
assert(state == Subtask.State.STARTED) - [ret] = canon_lower(consumer_opts, producer_ft, producer2, task, []) + [ret] = canon_lower(consumer_opts, producer_ft, producer2, thread, []) state,subi2 = unpack_result(ret) - assert(subi2 == 2) + assert(subi2 == 3) assert(state == Subtask.State.STARTING) - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, subi1, seti) - [] = canon_waitable_join(task, subi2, seti) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, subi1, seti) + [] = canon_waitable_join(thread, subi2, seti) fut.set() assert(producer1_done == False) @@ -1084,19 +1090,19 @@ def consumer(task, args): remain = [subi1, subi2] while remain: retp = 8 - [event] = canon_waitable_set_wait(True, consumer_heap.memory, task, seti, retp) + [event] = canon_waitable_set_wait(True, consumer_heap.memory, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(consumer_heap.memory[retp+4] == Subtask.State.RETURNED) subi = consumer_heap.memory[retp] remain.remove(subi) - canon_subtask_drop(task, subi) + canon_subtask_drop(thread, subi) assert(producer1_done == True) assert(producer2_done == True) - [] = canon_waitable_set_drop(task, seti) + [] = canon_waitable_set_drop(thread, seti) - canon_task_return(task, [U8Type()], consumer_opts, [84]) + canon_task_return(thread, [U8Type()], consumer_opts, [84]) return [] consumer_inst = ComponentInstance(store) @@ -1118,9 +1124,9 @@ def test_sync_using_wait(): hostcall_inst = ComponentInstance(store) ft = FuncType([], []) - def core_hostcall_pre(fut, task, args): - task.thread.suspend_until(fut.is_set) - [] = canon_task_return(task, [], hostcall_opts, []) + def core_hostcall_pre(fut, thread, args): + thread.suspend_until(fut.is_set) + [] = canon_task_return(thread, [], hostcall_opts, []) return [] fut1 = RacyBool(False) core_hostcall1 = partial(core_hostcall_pre, fut1) @@ -1133,38 +1139,38 @@ def core_hostcall_pre(fut, task, args): lower_opts = mk_opts(lower_heap.memory) 
lower_opts.sync = False - def core_func(task, args): - [ret] = canon_lower(lower_opts, ft, hostcall1, task, []) + def core_func(thread, args): + [ret] = canon_lower(lower_opts, ft, hostcall1, thread, []) state,subi1 = unpack_result(ret) - assert(subi1 == 1) + assert(subi1 == 2) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(lower_opts, ft, hostcall2, task, []) + [ret] = canon_lower(lower_opts, ft, hostcall2, thread, []) state,subi2 = unpack_result(ret) - assert(subi2 == 2) + assert(subi2 == 3) assert(state == Subtask.State.STARTED) - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, subi1, seti) - [] = canon_waitable_join(task, subi2, seti) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, subi1, seti) + [] = canon_waitable_join(thread, subi2, seti) fut1.set() retp = lower_heap.realloc(0,0,8,4) - [event] = canon_waitable_set_wait(True, lower_heap.memory, task, seti, retp) + [event] = canon_waitable_set_wait(True, lower_heap.memory, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(lower_heap.memory[retp] == subi1) assert(lower_heap.memory[retp+4] == Subtask.State.RETURNED) fut2.set() - [event] = canon_waitable_set_wait(True, lower_heap.memory, task, seti, retp) + [event] = canon_waitable_set_wait(True, lower_heap.memory, thread, seti, retp) assert(event == EventCode.SUBTASK) assert(lower_heap.memory[retp] == subi2) assert(lower_heap.memory[retp+4] == Subtask.State.RETURNED) - canon_subtask_drop(task, subi1) - canon_subtask_drop(task, subi2) - canon_waitable_set_drop(task, seti) + canon_subtask_drop(thread, subi1) + canon_subtask_drop(thread, subi2) + canon_waitable_set_drop(thread, seti) return [] @@ -1364,49 +1370,49 @@ def on_resolve(results): nonlocal dst_stream dst_stream = HostSink(results[0], chunk=4) - def core_func(task, args): + def core_func(thread, args): assert(len(args) == 1) rsi1 = args[0] - assert(rsi1 == 1) - [packed] = canon_stream_new(StreamType(U8Type()), task) + 
assert(rsi1 == 2) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi2,wsi2 = unpack_new_ends(packed) - [] = canon_task_return(task, [StreamType(U8Type())], opts, [rsi2]) - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi1, 0, 4) + [] = canon_task_return(thread, [StreamType(U8Type())], opts, [rsi2]) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi1, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) assert(mem[0:4] == b'\x01\x02\x03\x04') - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi3,wsi3 = unpack_new_ends(packed) retp = 12 - [ret] = canon_lower(opts, ft, host_import, task, [rsi3, retp]) + [ret] = canon_lower(opts, ft, host_import, thread, [rsi3, retp]) assert(ret == Subtask.State.RETURNED) rsi4 = mem[retp] - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi3, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi3, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, task, rsi4, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, thread, rsi4, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi2, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi2, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi1, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi1, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.DROPPED) assert(mem[0:4] == b'\x05\x06\x07\x08') - [ret] = canon_stream_write(StreamType(U8Type()), sync_opts, task, wsi3, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), sync_opts, thread, 
wsi3, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, task, rsi4, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, thread, rsi4, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), sync_opts, task, wsi2, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), sync_opts, thread, wsi2, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi1) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi4) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi2) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi3) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi1) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi4) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi2) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi3) return [] run_lift(opts, inst, ft, core_func, on_start, on_resolve) @@ -1424,14 +1430,14 @@ def test_async_stream_ops(): host_import_incoming = None host_import_outgoing = None def host_import(caller, on_start, on_resolve): - def thread_func(task): + def thread_func(thread): nonlocal host_import_incoming, host_import_outgoing args = on_start() assert(len(args) == 1) assert(isinstance(args[0], ReadableStream)) host_import_incoming = HostSink(args[0], chunk=4, remain = 0) host_import_outgoing = HostSource(U8Type(), [], chunk=4, destroy_if_empty=False) - task.return_([host_import_outgoing]) + thread.task.return_([host_import_outgoing]) while True: vs = None results_ready = RacyBool(False) @@ -1440,7 +1446,7 @@ def consume_results(): vs = host_import_incoming.consume(4) results_ready.set() threading.Thread(target = consume_results).start() - 
task.thread.suspend_until(results_ready.is_set) + thread.suspend_until(results_ready.is_set) if vs: for i in range(len(vs)): vs[i] += 10 @@ -1461,79 +1467,79 @@ def on_resolve(results): nonlocal dst_stream dst_stream = HostSink(results[0], chunk=4, remain = 0) - def core_func(task, args): + def core_func(thread, args): [rsi1] = args - assert(rsi1 == 1) - [packed] = canon_stream_new(StreamType(U8Type()), task) + assert(rsi1 == 2) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi2,wsi2 = unpack_new_ends(packed) - [] = canon_task_return(task, [StreamType(U8Type())], opts, [rsi2]) - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi1, 0, 4) + [] = canon_task_return(thread, [StreamType(U8Type())], opts, [rsi2]) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi1, 0, 4) assert(ret == definitions.BLOCKED) src_stream.write([1,2,3,4]) retp = 16 - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, rsi1, seti) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, rsi1, seti) definitions.throw_it = True - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem[retp+0] == rsi1) result,n = unpack_result(mem[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) assert(mem[0:4] == b'\x01\x02\x03\x04') - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi3,wsi3 = unpack_new_ends(packed) - [ret] = canon_lower(opts, ft, host_import, task, [rsi3, retp]) + [ret] = canon_lower(opts, ft, host_import, thread, [rsi3, retp]) assert(ret == Subtask.State.RETURNED) rsi4 = mem[16] - assert(rsi4 == 4) - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi3, 0, 4) + assert(rsi4 == 5) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi3, 0, 4) assert(ret == definitions.BLOCKED) 
host_import_incoming.set_remain(100) - [] = canon_waitable_join(task, wsi3, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [] = canon_waitable_join(thread, wsi3, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_WRITE) assert(mem[retp+0] == wsi3) result,n = unpack_result(mem[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, task, rsi4, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, thread, rsi4, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi2, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi2, 0, 4) assert(ret == definitions.BLOCKED) dst_stream.set_remain(100) - [] = canon_waitable_join(task, wsi2, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [] = canon_waitable_join(thread, wsi2, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_WRITE) assert(mem[retp+0] == wsi2) result,n = unpack_result(mem[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) src_stream.write([5,6,7,8]) src_stream.destroy_once_empty() - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi1, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi1, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.DROPPED) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi1) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi1) assert(mem[0:4] == b'\x05\x06\x07\x08') - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi3, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi3, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [] = 
canon_stream_drop_writable(StreamType(U8Type()), task, wsi3) - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi4, 0, 4) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi3) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi4, 0, 4) assert(ret == definitions.BLOCKED) - [] = canon_waitable_join(task, rsi4, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [] = canon_waitable_join(thread, rsi4, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem[retp+0] == rsi4) result,n = unpack_result(mem[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, task, rsi4, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), sync_opts, thread, rsi4, 0, 4) assert(ret == CopyResult.DROPPED) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi4) - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi2, 0, 4) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi4) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi2, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi2) - [] = canon_waitable_set_drop(task, seti) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi2) + [] = canon_waitable_set_drop(thread, seti) return [] run_lift(opts, inst, ft, core_func, on_start, on_resolve) @@ -1551,10 +1557,10 @@ def on_resolve(results): nonlocal dst_stream dst_stream = results[0] - def core_func(task, args): + def core_func(thread, args): assert(len(args) == 1) rsi1 = args[0] - assert(rsi1 == 1) + assert(rsi1 == 2) return [rsi1] opts = mk_opts() @@ -1578,24 +1584,24 @@ def host_import(caller, on_start, on_resolve): on_resolve(args) return mk_done_task(caller) - def core_func(task, args): + def core_func(thread, args): 
assert(len(args) == 0) - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi,wsi = unpack_new_ends(packed) - assert(rsi == 1) - assert(wsi == 2) - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi, 0, 4) + assert(rsi == 2) + assert(wsi == 3) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi, 0, 4) assert(ret == definitions.BLOCKED) retp = 8 - [ret] = canon_lower(opts, host_ft, host_import, task, [rsi, retp]) + [ret] = canon_lower(opts, host_ft, host_import, thread, [rsi, retp]) assert(ret == Subtask.State.RETURNED) rsi2 = int.from_bytes(mem[retp : retp+4], 'little', signed=False) - assert(rsi2 == 1) + assert(rsi2 == 2) try: - canon_stream_cancel_write(StreamType(U8Type()), True, task, wsi) + canon_stream_cancel_write(StreamType(U8Type()), True, thread, wsi) except Trap: pass - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi) return [] def on_start(): return [] @@ -1605,6 +1611,7 @@ def on_resolve(results): assert(len(results) == 0) def test_host_partial_reads_writes(): + store = Store() mem = bytearray(20) opts = mk_opts(memory=mem, sync=False) @@ -1624,56 +1631,56 @@ def host_sink(caller, on_start, on_resolve): on_resolve([]) return mk_done_task(caller) - def core_func(task, args): + def core_func(thread, args): assert(len(args) == 0) retp = 4 - [ret] = canon_lower(opts, source_ft, host_source, task, [retp]) + [ret] = canon_lower(opts, source_ft, host_source, thread, [retp]) assert(ret == Subtask.State.RETURNED) rsi = mem[retp] - assert(rsi == 1) - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi, 0, 4) + assert(rsi == 2) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi, 0, 4) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) assert(mem[0:2] == b'\x01\x02') - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, 
rsi, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi, 0, 4) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) assert(mem[0:2] == b'\x03\x04') - [ret] = canon_stream_read(StreamType(U8Type()), opts, task, rsi, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), opts, thread, rsi, 0, 4) assert(ret == definitions.BLOCKED) src.write([5,6]) - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, rsi, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, rsi, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem[retp+0] == rsi) result,n = unpack_result(mem[retp+4]) assert(n == 2 and result == CopyResult.COMPLETED) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi) - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi,wsi = unpack_new_ends(packed) - assert(rsi == 1) - assert(wsi == 3) - [ret] = canon_lower(opts, sink_ft, host_sink, task, [rsi]) + assert(rsi == 2) + assert(wsi == 4) + [ret] = canon_lower(opts, sink_ft, host_sink, thread, [rsi]) assert(ret == Subtask.State.RETURNED) mem[0:6] = b'\x01\x02\x03\x04\x05\x06' - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi, 0, 6) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi, 0, 6) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), opts, task, wsi, 2, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts, thread, wsi, 2, 4) assert(ret == definitions.BLOCKED) dst.set_remain(4) - [] = canon_waitable_join(task, wsi, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [] = canon_waitable_join(thread, 
wsi, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_WRITE) assert(mem[retp+0] == wsi) result,n = unpack_result(mem[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) assert(dst.received == [1,2,3,4,5,6]) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi) - [] = canon_waitable_set_drop(task, seti) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi) + [] = canon_waitable_set_drop(thread, seti) dst.set_remain(100) assert(dst.consume(100) is None) return [] @@ -1694,62 +1701,62 @@ def test_wasm_to_wasm_stream(): mem1 = bytearray(24) opts1 = mk_opts(memory=mem1, sync=False) ft1 = FuncType([], [StreamType(U8Type())]) - def core_func1(task, args): + def core_func1(thread, args): assert(not args) - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi,wsi = unpack_new_ends(packed) - [] = canon_task_return(task, [StreamType(U8Type())], opts1, [rsi]) + [] = canon_task_return(thread, [StreamType(U8Type())], opts1, [rsi]) - task.thread.suspend_until(fut1.is_set) + thread.suspend_until(fut1.is_set) mem1[0:4] = b'\x01\x02\x03\x04' - [ret] = canon_stream_write(StreamType(U8Type()), opts1, task, wsi, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts1, thread, wsi, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), opts1, task, wsi, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), opts1, thread, wsi, 0, 4) result,n = unpack_result(ret) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), opts1, task, wsi, 0, 0) + [ret] = canon_stream_write(StreamType(U8Type()), opts1, thread, wsi, 0, 0) assert(ret == definitions.BLOCKED) - [ret] = canon_stream_cancel_write(StreamType(U8Type()), False, task, wsi) + [ret] = canon_stream_cancel_write(StreamType(U8Type()), False, thread, 
wsi) result,n = unpack_result(ret) assert(n == 0 and result == CopyResult.CANCELLED) - task.thread.suspend_until(fut2.is_set) + thread.suspend_until(fut2.is_set) mem1[0:8] = b'\x05\x06\x07\x08\x09\x0a\x0b\x0c' - [ret] = canon_stream_write(StreamType(U8Type()), opts1, task, wsi, 0, 8) + [ret] = canon_stream_write(StreamType(U8Type()), opts1, thread, wsi, 0, 8) assert(ret == definitions.BLOCKED) fut3.set() retp = 16 - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, wsi, seti) - [event] = canon_waitable_set_wait(True, mem1, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, wsi, seti) + [event] = canon_waitable_set_wait(True, mem1, thread, seti, retp) assert(event == EventCode.STREAM_WRITE) assert(mem1[retp+0] == wsi) result,n = unpack_result(mem1[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(U8Type()), opts1, task, wsi, 12345, 0) + [ret] = canon_stream_write(StreamType(U8Type()), opts1, thread, wsi, 12345, 0) assert(ret == definitions.BLOCKED) fut4.set() - [event] = canon_waitable_set_wait(True, mem1, task, seti, retp) + [event] = canon_waitable_set_wait(True, mem1, thread, seti, retp) assert(event == EventCode.STREAM_WRITE) assert(mem1[retp+0] == wsi) assert(mem1[retp+4] == 0) - [ret] = canon_stream_write(StreamType(U8Type()), opts1, task, wsi, 12345, 0) + [ret] = canon_stream_write(StreamType(U8Type()), opts1, thread, wsi, 12345, 0) assert(ret == 0) - [errctxi] = canon_error_context_new(opts1, task, 0, 0) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi) - [] = canon_waitable_set_drop(task, seti) - [] = canon_error_context_drop(task, errctxi) + [errctxi] = canon_error_context_new(opts1, thread, 0, 0) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi) + [] = canon_waitable_set_drop(thread, seti) + [] = canon_error_context_drop(thread, errctxi) return [] func1 = partial(canon_lift, opts1, inst1, ft1, core_func1) @@ 
-1759,24 +1766,24 @@ def core_func1(task, args): mem2 = heap2.memory opts2 = mk_opts(memory=heap2.memory, realloc=heap2.realloc, sync=False) ft2 = FuncType([], []) - def core_func2(task, args): + def core_func2(thread, args): assert(not args) - [] = canon_task_return(task, [], opts2, []) + [] = canon_task_return(thread, [], opts2, []) retp = 16 - [ret] = canon_lower(opts2, ft1, func1, task, [retp]) + [ret] = canon_lower(opts2, ft1, func1, thread, [retp]) assert(ret == Subtask.State.RETURNED) rsi = mem2[retp] - assert(rsi == 1) + assert(rsi == 2) - [ret] = canon_stream_read(StreamType(U8Type()), opts2, task, rsi, 0, 8) + [ret] = canon_stream_read(StreamType(U8Type()), opts2, thread, rsi, 0, 8) assert(ret == definitions.BLOCKED) fut1.set() - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, rsi, seti) - [event] = canon_waitable_set_wait(True, mem2, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, rsi, seti) + [event] = canon_waitable_set_wait(True, mem2, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem2[retp+0] == rsi) result,n = unpack_result(mem2[retp+4]) @@ -1784,34 +1791,34 @@ def core_func2(task, args): assert(mem2[0:8] == b'\x01\x02\x03\x04\x01\x02\x03\x04') fut2.set() - task.thread.suspend_until(fut3.is_set) + thread.suspend_until(fut3.is_set) - [ret] = canon_stream_read(StreamType(U8Type()), opts2, task, rsi, 12345, 0) + [ret] = canon_stream_read(StreamType(U8Type()), opts2, thread, rsi, 12345, 0) assert(ret == 0) mem2[0:8] = bytes(8) - [ret] = canon_stream_read(StreamType(U8Type()), opts2, task, rsi, 0, 2) + [ret] = canon_stream_read(StreamType(U8Type()), opts2, thread, rsi, 0, 2) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) assert(mem2[0:6] == b'\x05\x06\x00\x00\x00\x00') - [ret] = canon_stream_read(StreamType(U8Type()), opts2, task, rsi, 2, 2) + [ret] = canon_stream_read(StreamType(U8Type()), opts2, thread, rsi, 2, 2) result,n = 
unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) assert(mem2[0:6] == b'\x05\x06\x07\x08\x00\x00') - task.thread.suspend_until(fut4.is_set) + thread.suspend_until(fut4.is_set) - [ret] = canon_stream_read(StreamType(U8Type()), opts2, task, rsi, 12345, 0) + [ret] = canon_stream_read(StreamType(U8Type()), opts2, thread, rsi, 12345, 0) assert(ret == definitions.BLOCKED) - [event] = canon_waitable_set_wait(True, mem2, task, seti, retp) + [event] = canon_waitable_set_wait(True, mem2, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem2[retp+0] == rsi) p2 = int.from_bytes(mem2[retp+4 : retp+8], 'little', signed=False) assert(p2 == (CopyResult.DROPPED | 1)) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi) - [] = canon_waitable_set_drop(task, seti) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi) + [] = canon_waitable_set_drop(thread, seti) return [] run_lift(opts2, inst2, ft2, core_func2, lambda:[], lambda _:()) @@ -1825,32 +1832,32 @@ def test_wasm_to_wasm_stream_empty(): mem1 = bytearray(24) opts1 = mk_opts(memory=mem1, sync=False) ft1 = FuncType([], [StreamType(None)]) - def core_func1(task, args): + def core_func1(thread, args): assert(not args) - [packed] = canon_stream_new(StreamType(None), task) + [packed] = canon_stream_new(StreamType(None), thread) rsi,wsi = unpack_new_ends(packed) - [] = canon_task_return(task, [StreamType(None)], opts1, [rsi]) + [] = canon_task_return(thread, [StreamType(None)], opts1, [rsi]) - task.thread.suspend_until(fut1.is_set) + thread.suspend_until(fut1.is_set) - [ret] = canon_stream_write(StreamType(None), opts1, task, wsi, 10000, 2) + [ret] = canon_stream_write(StreamType(None), opts1, thread, wsi, 10000, 2) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - [ret] = canon_stream_write(StreamType(None), opts1, task, wsi, 10000, 2) + [ret] = canon_stream_write(StreamType(None), opts1, thread, wsi, 10000, 2) result,n = 
unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - task.thread.suspend_until(fut2.is_set) + thread.suspend_until(fut2.is_set) - [ret] = canon_stream_write(StreamType(None), opts1, task, wsi, 0, 8) + [ret] = canon_stream_write(StreamType(None), opts1, thread, wsi, 0, 8) assert(ret == definitions.BLOCKED) fut3.set() retp = 16 - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, wsi, seti) - [event] = canon_waitable_set_wait(True, mem1, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, wsi, seti) + [event] = canon_waitable_set_wait(True, mem1, thread, seti, retp) assert(event == EventCode.STREAM_WRITE) assert(mem1[retp+0] == wsi) result,n = unpack_result(mem1[retp+4]) @@ -1858,9 +1865,9 @@ def core_func1(task, args): fut4.set() - [errctxi] = canon_error_context_new(opts1, task, 0, 0) - [] = canon_stream_drop_writable(StreamType(None), task, wsi) - [] = canon_error_context_drop(task, errctxi) + [errctxi] = canon_error_context_new(opts1, thread, 0, 0) + [] = canon_stream_drop_writable(StreamType(None), thread, wsi) + [] = canon_error_context_drop(thread, errctxi) return [] func1 = partial(canon_lift, opts1, inst1, ft1, core_func1) @@ -1870,45 +1877,45 @@ def core_func1(task, args): mem2 = heap2.memory opts2 = mk_opts(memory=heap2.memory, realloc=heap2.realloc, sync=False) ft2 = FuncType([], []) - def core_func2(task, args): + def core_func2(thread, args): assert(not args) - [] = canon_task_return(task, [], opts2, []) + [] = canon_task_return(thread, [], opts2, []) retp = 0 - [ret] = canon_lower(opts2, ft1, func1, task, [retp]) + [ret] = canon_lower(opts2, ft1, func1, thread, [retp]) assert(ret == Subtask.State.RETURNED) rsi = mem2[0] - assert(rsi == 1) + assert(rsi == 2) - [ret] = canon_stream_read(StreamType(None), opts2, task, rsi, 0, 8) + [ret] = canon_stream_read(StreamType(None), opts2, thread, rsi, 0, 8) assert(ret == definitions.BLOCKED) fut1.set() - [seti] = 
canon_waitable_set_new(task) - [] = canon_waitable_join(task, rsi, seti) - [event] = canon_waitable_set_wait(True, mem2, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, rsi, seti) + [event] = canon_waitable_set_wait(True, mem2, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem2[retp+0] == rsi) result,n = unpack_result(mem2[retp+4]) assert(n == 4 and result == CopyResult.COMPLETED) fut2.set() - task.thread.suspend_until(fut3.is_set) + thread.suspend_until(fut3.is_set) - [ret] = canon_stream_read(StreamType(None), opts2, task, rsi, 1000000, 2) + [ret] = canon_stream_read(StreamType(None), opts2, thread, rsi, 1000000, 2) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(None), opts2, task, rsi, 1000000, 2) + [ret] = canon_stream_read(StreamType(None), opts2, thread, rsi, 1000000, 2) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - task.thread.suspend_until(fut4.is_set) + thread.suspend_until(fut4.is_set) - [ret] = canon_stream_read(StreamType(None), opts2, task, rsi, 1000000, 2) + [ret] = canon_stream_read(StreamType(None), opts2, thread, rsi, 1000000, 2) result,n = unpack_result(ret) assert(n == 0 and result == CopyResult.DROPPED) - [] = canon_stream_drop_readable(StreamType(None), task, rsi) + [] = canon_stream_drop_readable(StreamType(None), thread, rsi) return [] run_lift(opts2, inst2, ft2, core_func2, lambda:[], lambda _:()) @@ -1939,74 +1946,74 @@ def host_func2(caller, on_start, on_resolve): return mk_done_task(caller) lift_opts = mk_opts() - def core_func(task, args): + def core_func(thread, args): assert(not args) - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi,wsi = unpack_new_ends(packed) - [ret] = canon_lower(lower_opts, host_ft1, host_func1, task, [rsi]) + [ret] = canon_lower(lower_opts, host_ft1, host_func1, thread, 
[rsi]) assert(ret == Subtask.State.RETURNED) mem[0:4] = b'\x0a\x0b\x0c\x0d' - [ret] = canon_stream_write(StreamType(U8Type()), lower_opts, task, wsi, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), lower_opts, thread, wsi, 0, 4) assert(ret == definitions.BLOCKED) host_sink.set_remain(2) got = host_sink.consume(2) assert(got == [0xa, 0xb]) - [ret] = canon_stream_cancel_write(StreamType(U8Type()), True, task, wsi) + [ret] = canon_stream_cancel_write(StreamType(U8Type()), True, thread, wsi) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi) host_sink.set_remain(100) assert(host_sink.consume(100) is None) - [packed] = canon_stream_new(StreamType(U8Type()), task) + [packed] = canon_stream_new(StreamType(U8Type()), thread) rsi,wsi = unpack_new_ends(packed) - [ret] = canon_lower(lower_opts, host_ft1, host_func1, task, [rsi]) + [ret] = canon_lower(lower_opts, host_ft1, host_func1, thread, [rsi]) assert(ret == Subtask.State.RETURNED) mem[0:4] = b'\x01\x02\x03\x04' - [ret] = canon_stream_write(StreamType(U8Type()), lower_opts, task, wsi, 0, 4) + [ret] = canon_stream_write(StreamType(U8Type()), lower_opts, thread, wsi, 0, 4) assert(ret == definitions.BLOCKED) host_sink.set_remain(2) got = host_sink.consume(2) assert(got == [1, 2]) - [ret] = canon_stream_cancel_write(StreamType(U8Type()), False, task, wsi) + [ret] = canon_stream_cancel_write(StreamType(U8Type()), False, thread, wsi) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - [] = canon_stream_drop_writable(StreamType(U8Type()), task, wsi) + [] = canon_stream_drop_writable(StreamType(U8Type()), thread, wsi) host_sink.set_remain(100) assert(host_sink.consume(100) is None) retp = 16 - [ret] = canon_lower(lower_opts, host_ft2, host_func2, task, [retp]) + [ret] = canon_lower(lower_opts, host_ft2, host_func2, thread, 
[retp]) assert(ret == Subtask.State.RETURNED) rsi = mem[retp] - [ret] = canon_stream_read(StreamType(U8Type()), lower_opts, task, rsi, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), lower_opts, thread, rsi, 0, 4) assert(ret == definitions.BLOCKED) - [ret] = canon_stream_cancel_read(StreamType(U8Type()), True, task, rsi) + [ret] = canon_stream_cancel_read(StreamType(U8Type()), True, thread, rsi) result,n = unpack_result(ret) assert(n == 0 and result == CopyResult.CANCELLED) - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi) - [ret] = canon_lower(lower_opts, host_ft2, host_func2, task, [retp]) + [ret] = canon_lower(lower_opts, host_ft2, host_func2, thread, [retp]) assert(ret == Subtask.State.RETURNED) rsi = mem[retp] - [ret] = canon_stream_read(StreamType(U8Type()), lower_opts, task, rsi, 0, 4) + [ret] = canon_stream_read(StreamType(U8Type()), lower_opts, thread, rsi, 0, 4) assert(ret == definitions.BLOCKED) host_source.block_cancel() - [ret] = canon_stream_cancel_read(StreamType(U8Type()), False, task, rsi) + [ret] = canon_stream_cancel_read(StreamType(U8Type()), False, thread, rsi) assert(ret == definitions.BLOCKED) host_source.write([7,8]) host_source.unblock_cancel() - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, rsi, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, rsi, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.STREAM_READ) assert(mem[retp+0] == rsi) result,n = unpack_result(mem[retp+4]) assert(n == 2 and result == CopyResult.CANCELLED) assert(mem[0:2] == b'\x07\x08') - [] = canon_stream_drop_readable(StreamType(U8Type()), task, rsi) - [] = canon_waitable_set_drop(task, seti) + [] = canon_stream_drop_readable(StreamType(U8Type()), thread, rsi) + [] = canon_waitable_set_drop(thread, seti) 
return [] @@ -2071,78 +2078,78 @@ def test_futures(): host_ft1 = FuncType([FutureType(U8Type())],[FutureType(U8Type())]) def host_func(caller, on_start, on_resolve): - def thread_func(task): + def thread_func(thread): [future] = on_start() outgoing = HostFutureSource(U8Type()) - task.return_([outgoing]) + thread.task.return_([outgoing]) incoming = HostFutureSink(U8Type()) future.read(None, incoming, lambda why:()) - task.thread.suspend_until(incoming.has_v.is_set) + thread.suspend_until(incoming.has_v.is_set) assert(incoming.v == 42) outgoing.set_result(43) return mk_task(caller, on_resolve, thread_func) lift_opts = mk_opts() - def core_func(task, args): + def core_func(thread, args): assert(not args) - [packed] = canon_future_new(FutureType(U8Type()), task) + [packed] = canon_future_new(FutureType(U8Type()), thread) rfi,wfi = unpack_new_ends(packed) retp = 16 - [ret] = canon_lower(lower_opts, host_ft1, host_func, task, [rfi, retp]) + [ret] = canon_lower(lower_opts, host_ft1, host_func, thread, [rfi, retp]) assert(ret == Subtask.State.RETURNED) rfi = mem[retp] readp = 0 - [ret] = canon_future_read(FutureType(U8Type()), lower_opts, task, rfi, readp) + [ret] = canon_future_read(FutureType(U8Type()), lower_opts, thread, rfi, readp) assert(ret == definitions.BLOCKED) writep = 8 mem[writep] = 42 - [ret] = canon_future_write(FutureType(U8Type()), lower_opts, task, wfi, writep) + [ret] = canon_future_write(FutureType(U8Type()), lower_opts, thread, wfi, writep) assert(ret == CopyResult.COMPLETED) - [seti] = canon_waitable_set_new(task) - [] = canon_waitable_join(task, rfi, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, retp) + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, rfi, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, retp) assert(event == EventCode.FUTURE_READ) assert(mem[retp+0] == rfi) assert(mem[retp+4] == CopyResult.COMPLETED) assert(mem[readp] == 43) - [] = 
canon_future_drop_writable(FutureType(U8Type()), task, wfi) - [] = canon_future_drop_readable(FutureType(U8Type()), task, rfi) - [] = canon_waitable_set_drop(task, seti) + [] = canon_future_drop_writable(FutureType(U8Type()), thread, wfi) + [] = canon_future_drop_readable(FutureType(U8Type()), thread, rfi) + [] = canon_waitable_set_drop(thread, seti) - [packed] = canon_future_new(FutureType(U8Type()), task) + [packed] = canon_future_new(FutureType(U8Type()), thread) rfi,wfi = unpack_new_ends(packed) - [ret] = canon_lower(lower_opts, host_ft1, host_func, task, [rfi, retp]) + [ret] = canon_lower(lower_opts, host_ft1, host_func, thread, [rfi, retp]) assert(ret == Subtask.State.RETURNED) rfi = mem[retp] readp = 0 - [ret] = canon_future_read(FutureType(U8Type()), lower_opts, task, rfi, readp) + [ret] = canon_future_read(FutureType(U8Type()), lower_opts, thread, rfi, readp) assert(ret == definitions.BLOCKED) writep = 8 mem[writep] = 42 - [ret] = canon_future_write(FutureType(U8Type()), lower_opts, task, wfi, writep) + [ret] = canon_future_write(FutureType(U8Type()), lower_opts, thread, wfi, writep) assert(ret == CopyResult.COMPLETED) - while not task.inst.table.get(rfi).has_pending_event(): - canon_yield(True, task) + while not thread.task.inst.table.get(rfi).has_pending_event(): + canon_thread_yield(True, thread) - [ret] = canon_future_cancel_read(FutureType(U8Type()), True, task, rfi) + [ret] = canon_future_cancel_read(FutureType(U8Type()), True, thread, rfi) assert(ret == CopyResult.COMPLETED) assert(mem[readp] == 43) - [] = canon_future_drop_writable(FutureType(U8Type()), task, wfi) - [] = canon_future_drop_readable(FutureType(U8Type()), task, rfi) + [] = canon_future_drop_writable(FutureType(U8Type()), thread, wfi) + [] = canon_future_drop_readable(FutureType(U8Type()), thread, rfi) - [packed] = canon_future_new(FutureType(U8Type()), task) + [packed] = canon_future_new(FutureType(U8Type()), thread) rfi,wfi = unpack_new_ends(packed) trapped = False try: - 
canon_future_drop_writable(FutureType(U8Type()), task, wfi) + canon_future_drop_writable(FutureType(U8Type()), thread, wfi) except Trap: trapped = True assert(trapped) @@ -2153,246 +2160,306 @@ def core_func(task, args): def test_cancel_subtask(): + store = Store() ft = FuncType([U8Type()], [U8Type()]) callee_heap = Heap(10) callee_opts = mk_opts(callee_heap.memory, sync = False) sync_callee_opts = mk_opts(callee_heap.memory, sync = True) - store = Store() callee_inst = ComponentInstance(store) - def core_callee1(task, args): + def core_callee1(thread, args): assert(False) callee1 = partial(canon_lift, callee_opts, callee_inst, ft, core_callee1) - def core_callee2(task, args): + def core_callee2(thread, args): [x] = args - [si] = canon_waitable_set_new(task) - [ret] = canon_waitable_set_wait(True, callee_heap.memory, task, si, 0) + [si] = canon_waitable_set_new(thread) + [ret] = canon_waitable_set_wait(True, callee_heap.memory, thread, si, 0) assert(ret == EventCode.TASK_CANCELLED) match x: case 1: - [] = canon_task_return(task, [U8Type()], callee_opts, [42]) + [] = canon_task_return(thread, [U8Type()], callee_opts, [42]) case 2: - [] = canon_task_cancel(task) + [] = canon_task_cancel(thread) case 3: - [_] = canon_yield(True, task) - [] = canon_task_return(task, [U8Type()], callee_opts, [43]) + [_] = canon_thread_yield(True, thread) + [] = canon_task_return(thread, [U8Type()], callee_opts, [43]) case 4: - [_] = canon_yield(True, task) - [] = canon_task_cancel(task) + [_] = canon_thread_yield(True, thread) + [] = canon_task_cancel(thread) case _: assert(False) return [] callee2 = partial(canon_lift, callee_opts, callee_inst, ft, core_callee2) - def core_callee3(task, args): + def core_callee3(thread, args): [x] = args - [cancelled] = canon_yield(True, task) + [cancelled] = canon_thread_yield(True, thread) if cancelled: - [] = canon_task_cancel(task) + [] = canon_task_cancel(thread) else: - [] = canon_task_return(task, [U8Type()], callee_opts, [83]) + [] = 
canon_task_return(thread, [U8Type()], callee_opts, [83]) return [] callee3 = partial(canon_lift, callee_opts, callee_inst, ft, core_callee3) host_fut4 = RacyBool(False) def host_import4(caller, on_start, on_resolve): - def thread_func(task): + def thread_func(thread): args = on_start() assert(len(args) == 1) assert(args[0] == 42) - task.thread.suspend_until(host_fut4.is_set) - task.return_([43]) + thread.suspend_until(host_fut4.is_set) + thread.task.return_([43]) return mk_task(caller, on_resolve, thread_func) - def core_callee4(task, args): + def core_callee4(thread, args): [x] = args - [result] = canon_lower(sync_callee_opts, ft, host_import4, task, [42]) + [result] = canon_lower(sync_callee_opts, ft, host_import4, thread, [42]) assert(result == 43) try: - [] = canon_task_cancel(task) + [] = canon_task_cancel(thread) assert(False) except Trap: pass - [seti] = canon_waitable_set_new(task) - [result] = canon_waitable_set_wait(True, callee_heap.memory, task, seti, 0) + [seti] = canon_waitable_set_new(thread) + [result] = canon_waitable_set_wait(True, callee_heap.memory, thread, seti, 0) assert(result == EventCode.TASK_CANCELLED) - [result] = canon_waitable_set_poll(True, callee_heap.memory, task, seti, 0) + [result] = canon_waitable_set_poll(True, callee_heap.memory, thread, seti, 0) assert(result == EventCode.NONE) - [] = canon_task_cancel(task) + [] = canon_task_cancel(thread) return [] callee4 = partial(canon_lift, callee_opts, callee_inst, ft, core_callee4) host_fut5 = RacyBool(False) def host_import5(caller, on_start, on_resolve): - def thread_func(task): + def thread_func(thread): args = on_start() assert(len(args) == 1) assert(args[0] == 42) - task.thread.suspend_until(host_fut5.is_set) - assert(task.state == Task.State.PENDING_CANCEL) - task.thread.suspend_until(host_fut5.is_set) - task.return_([43]) + thread.suspend_until(host_fut5.is_set) + assert(thread.task.state == Task.State.PENDING_CANCEL) + thread.suspend_until(host_fut5.is_set) + 
thread.task.return_([43]) return mk_task(caller, on_resolve, thread_func) - def core_callee5(task, args): + def core_callee5(thread, args): [x] = args - [ret] = canon_lower(callee_opts, ft, host_import5, task, [42, 0]) + assert(x == 13) + [ret] = canon_lower(callee_opts, ft, host_import5, thread, [42, 0]) state,subi = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_subtask_cancel(True, task, subi) + [ret] = canon_subtask_cancel(True, thread, subi) assert(ret == Subtask.State.RETURNED) - [] = canon_task_return(task, [U8Type()], callee_opts, [44]) + [] = canon_task_return(thread, [U8Type()], callee_opts, [44]) return [] callee5 = partial(canon_lift, callee_opts, callee_inst, ft, core_callee5) + core_ftbl = Table() + core_ft = CoreFuncType(['i32'], []) + def thread_func(cancellable, thread, args): + [mainthreadi] = args + if cancellable: + [ret] = canon_thread_switch_to(True, thread, mainthreadi) + assert(ret == SuspendResult.CANCELLED) + [ret] = canon_thread_switch_to(True, thread, mainthreadi) + assert(ret == SuspendResult.COMPLETED) + [] = canon_task_return(thread, [U8Type()], callee_opts, [45]) + else: + [ret] = canon_thread_switch_to(False, thread, mainthreadi) + assert(ret == SuspendResult.COMPLETED) + return [] + cthread_func = partial(thread_func, True) + ncthread_func = partial(thread_func, False) + cfi = core_ftbl.add(CoreFuncRef(core_ft, cthread_func)) + ncfi = core_ftbl.add(CoreFuncRef(core_ft, ncthread_func)) + + def core_callee6(thread, args): + [x] = args + assert(x == 14) + + [mainthreadi] = canon_thread_index(thread) + + [threadi1] = canon_thread_new_indirect(core_ft, core_ftbl, thread, ncfi, mainthreadi) + [ret] = canon_thread_switch_to(True, thread, threadi1) + assert(ret == SuspendResult.COMPLETED) + + [threadi2] = canon_thread_new_indirect(core_ft, core_ftbl, thread, cfi, mainthreadi) + [ret] = canon_thread_switch_to(True, thread, threadi2) + assert(ret == SuspendResult.COMPLETED) + + [threadi3] = 
canon_thread_new_indirect(core_ft, core_ftbl, thread, ncfi, mainthreadi) + [ret] = canon_thread_switch_to(True, thread, threadi3) + assert(ret == SuspendResult.COMPLETED) + + [ret] = canon_thread_suspend(False, thread) + assert(ret == SuspendResult.COMPLETED) + + [] = canon_thread_resume_later(thread, threadi1) + [] = canon_thread_resume_later(thread, threadi2) + [] = canon_thread_resume_later(thread, threadi3) + return [] + callee6 = partial(canon_lift, callee_opts, callee_inst, ft, core_callee6) + caller_heap = Heap(20) caller_opts = mk_opts(caller_heap.memory, sync = False) caller_inst = ComponentInstance(store) - def core_caller(task, args): + def core_caller(thread, args): [x] = args assert(x == 1) - [seti] = canon_waitable_set_new(task) + [seti] = canon_waitable_set_new(thread) callee_inst.backpressure = True - [ret] = canon_lower(caller_opts, ft, callee1, task, [13, 0]) + [ret] = canon_lower(caller_opts, ft, callee1, thread, [13, 0]) state,subi1 = unpack_result(ret) assert(state == Subtask.State.STARTING) - [ret] = canon_lower(caller_opts, ft, callee1, task, [13, 0]) + [ret] = canon_lower(caller_opts, ft, callee1, thread, [13, 0]) state,subi2 = unpack_result(ret) assert(state == Subtask.State.STARTING) - [ret] = canon_subtask_cancel(True, task, subi2) + [ret] = canon_subtask_cancel(True, thread, subi2) assert(ret == Subtask.State.CANCELLED_BEFORE_STARTED) - [ret] = canon_subtask_cancel(False, task, subi1) + [ret] = canon_subtask_cancel(False, thread, subi1) assert(ret == Subtask.State.CANCELLED_BEFORE_STARTED) callee_inst.backpressure = False - [ret] = canon_lower(caller_opts, ft, callee2, task, [1, 0]) + [ret] = canon_lower(caller_opts, ft, callee2, thread, [1, 0]) state,subi1 = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(caller_opts, ft, callee2, task, [2, 0]) + [ret] = canon_lower(caller_opts, ft, callee2, thread, [2, 0]) state,subi2 = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = 
canon_lower(caller_opts, ft, callee2, task, [3, 0]) + [ret] = canon_lower(caller_opts, ft, callee2, thread, [3, 0]) state,subi3 = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(caller_opts, ft, callee2, task, [3, 0]) + [ret] = canon_lower(caller_opts, ft, callee2, thread, [3, 0]) state,subi3_2 = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(caller_opts, ft, callee2, task, [4, 0]) + [ret] = canon_lower(caller_opts, ft, callee2, thread, [4, 0]) state,subi4 = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_lower(caller_opts, ft, callee2, task, [4, 0]) + [ret] = canon_lower(caller_opts, ft, callee2, thread, [4, 0]) state,subi4_2 = unpack_result(ret) assert(state == Subtask.State.STARTED) caller_heap.memory[0] = 13 - [ret] = canon_subtask_cancel(True, task, subi1) + [ret] = canon_subtask_cancel(True, thread, subi1) assert(ret == Subtask.State.RETURNED) assert(caller_heap.memory[0] == 42) - [] = canon_subtask_drop(task, subi1) + [] = canon_subtask_drop(thread, subi1) caller_heap.memory[0] = 13 - [ret] = canon_subtask_cancel(False, task, subi2) + [ret] = canon_subtask_cancel(False, thread, subi2) assert(ret == Subtask.State.CANCELLED_BEFORE_RETURNED) assert(caller_heap.memory[0] == 13) - [] = canon_subtask_drop(task, subi2) + [] = canon_subtask_drop(thread, subi2) caller_heap.memory[0] = 13 - [ret] = canon_subtask_cancel(False, task, subi3) + [ret] = canon_subtask_cancel(False, thread, subi3) assert(ret == definitions.BLOCKED) assert(caller_heap.memory[0] == 13) - [] = canon_waitable_join(task, subi3, seti) + [] = canon_waitable_join(thread, subi3, seti) retp = 8 - [ret] = canon_waitable_set_wait(True, caller_heap.memory, task, seti, retp) + [ret] = canon_waitable_set_wait(True, caller_heap.memory, thread, seti, retp) assert(ret == EventCode.SUBTASK) assert(caller_heap.memory[retp+0] == subi3) assert(caller_heap.memory[retp+4] == Subtask.State.RETURNED) 
assert(caller_heap.memory[0] == 43) - [] = canon_subtask_drop(task, subi3) + [] = canon_subtask_drop(thread, subi3) caller_heap.memory[0] = 13 - [ret] = canon_subtask_cancel(True, task, subi3_2) + [ret] = canon_subtask_cancel(True, thread, subi3_2) assert(ret == Subtask.State.RETURNED) assert(caller_heap.memory[0] == 43) - [] = canon_subtask_drop(task, subi3_2) + [] = canon_subtask_drop(thread, subi3_2) caller_heap.memory[0] = 13 - [ret] = canon_subtask_cancel(False, task, subi4) + [ret] = canon_subtask_cancel(False, thread, subi4) assert(ret == definitions.BLOCKED) assert(caller_heap.memory[0] == 13) - [] = canon_waitable_join(task, subi4, seti) + [] = canon_waitable_join(thread, subi4, seti) retp = 8 - [ret] = canon_waitable_set_wait(True, caller_heap.memory, task, seti, retp) + [ret] = canon_waitable_set_wait(True, caller_heap.memory, thread, seti, retp) assert(ret == EventCode.SUBTASK) assert(caller_heap.memory[retp+0] == subi4) assert(caller_heap.memory[retp+4] == Subtask.State.CANCELLED_BEFORE_RETURNED) - [] = canon_subtask_drop(task, subi4) + [] = canon_subtask_drop(thread, subi4) caller_heap.memory[0] = 13 - [ret] = canon_subtask_cancel(True, task, subi4_2) + [ret] = canon_subtask_cancel(True, thread, subi4_2) assert(ret == Subtask.State.CANCELLED_BEFORE_RETURNED) assert(caller_heap.memory[0] == 13) - [] = canon_subtask_drop(task, subi4_2) + [] = canon_subtask_drop(thread, subi4_2) caller_heap.memory[0] = 13 - [ret] = canon_lower(caller_opts, ft, callee3, task, [0, 0]) + [ret] = canon_lower(caller_opts, ft, callee3, thread, [0, 0]) state,subi = unpack_result(ret) assert(state == Subtask.State.STARTED) while caller_inst.table.get(subi).state == Subtask.State.STARTED: - [_] = canon_yield(True, task) - [ret] = canon_subtask_cancel(False, task, subi) + [_] = canon_thread_yield(True, thread) + [ret] = canon_subtask_cancel(False, thread, subi) assert(ret == Subtask.State.RETURNED) assert(caller_heap.memory[0] == 83) - [] = canon_subtask_drop(task, subi) + [] = 
canon_subtask_drop(thread, subi) caller_heap.memory[0] = 13 - [ret] = canon_lower(caller_opts, ft, callee3, task, [0, 0]) + [ret] = canon_lower(caller_opts, ft, callee3, thread, [0, 0]) state,subi = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_subtask_cancel(False, task, subi) + [ret] = canon_subtask_cancel(False, thread, subi) assert(ret == Subtask.State.CANCELLED_BEFORE_RETURNED) assert(caller_heap.memory[0] == 13) - [] = canon_subtask_drop(task, subi) + [] = canon_subtask_drop(thread, subi) caller_heap.memory[0] = 13 - [ret] = canon_lower(caller_opts, ft, callee4, task, [0, 0]) + [ret] = canon_lower(caller_opts, ft, callee4, thread, [0, 0]) state,subi = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_subtask_cancel(False, task, subi) + [ret] = canon_subtask_cancel(False, thread, subi) assert(ret == definitions.BLOCKED) assert(caller_heap.memory[0] == 13) host_fut4.set() - [] = canon_waitable_join(task, subi, seti) + [] = canon_waitable_join(thread, subi, seti) waitretp = 4 - [event] = canon_waitable_set_wait(True, caller_heap.memory, task, seti, waitretp) + [event] = canon_waitable_set_wait(True, caller_heap.memory, thread, seti, waitretp) assert(event == EventCode.SUBTASK) assert(caller_heap.memory[waitretp] == subi) assert(caller_heap.memory[waitretp+4] == Subtask.State.CANCELLED_BEFORE_RETURNED) assert(caller_heap.memory[0] == 13) - [] = canon_subtask_drop(task, subi) + [] = canon_subtask_drop(thread, subi) - caller_heap.memory[0] = 13 - [ret] = canon_lower(caller_opts, ft, callee5, task, [0, 0]) + [ret] = canon_lower(caller_opts, ft, callee5, thread, [13, 0]) state,subi = unpack_result(ret) assert(state == Subtask.State.STARTED) - [ret] = canon_subtask_cancel(False, task, subi) + [ret] = canon_subtask_cancel(False, thread, subi) assert(ret == definitions.BLOCKED) assert(caller_heap.memory[0] == 13) host_fut5.set() - [] = canon_waitable_join(task, subi, seti) + [] = canon_waitable_join(thread, subi, seti) 
waitretp = 4 - [event] = canon_waitable_set_wait(True, caller_heap.memory, task, seti, waitretp) + [event] = canon_waitable_set_wait(True, caller_heap.memory, thread, seti, waitretp) assert(event == EventCode.SUBTASK) assert(caller_heap.memory[waitretp] == subi) assert(caller_heap.memory[waitretp+4] == Subtask.State.RETURNED) assert(caller_heap.memory[0] == 44) - [] = canon_subtask_drop(task, subi) + [] = canon_subtask_drop(thread, subi) - [] = canon_waitable_set_drop(task, seti) - [] = canon_task_return(task, [U8Type()], caller_opts, [42]) + [ret] = canon_lower(caller_opts, ft, callee6, thread, [14, 0]) + state,subi = unpack_result(ret) + assert(state == Subtask.State.STARTED) + + [ret] = canon_subtask_cancel(False, thread, subi) + assert(ret == definitions.BLOCKED) + [] = canon_waitable_join(thread, subi, seti) + [event] = canon_waitable_set_wait(True, caller_heap.memory, thread, seti, 4) + assert(event == EventCode.SUBTASK) + assert(caller_heap.memory[0] == 45) + assert(caller_heap.memory[4] == subi) + assert(caller_heap.memory[8] == Subtask.State.RETURNED) + [] = canon_subtask_drop(thread, subi) + + [] = canon_waitable_set_drop(thread, seti) + [] = canon_task_return(thread, [U8Type()], caller_opts, [42]) return [] def on_start(): @@ -2417,49 +2484,49 @@ def test_self_empty(): async_opts = mk_opts(memory=mem, sync=False) ft = FuncType([],[]) - def core_func(task, args): - [seti] = canon_waitable_set_new(task) + def core_func(thread, args): + [seti] = canon_waitable_set_new(thread) - [packed] = canon_future_new(FutureType(None), task) + [packed] = canon_future_new(FutureType(None), thread) rfi,wfi = unpack_new_ends(packed) - [ret] = canon_future_write(FutureType(None), async_opts, task, wfi, 0xdeadbeef) + [ret] = canon_future_write(FutureType(None), async_opts, thread, wfi, 0xdeadbeef) assert(ret == definitions.BLOCKED) - [ret] = canon_future_read(FutureType(None), async_opts, task, rfi, 0xdeadbeef) + [ret] = canon_future_read(FutureType(None), async_opts, 
thread, rfi, 0xdeadbeef) assert(ret == CopyResult.COMPLETED) - [] = canon_future_drop_readable(FutureType(None), task, rfi) + [] = canon_future_drop_readable(FutureType(None), thread, rfi) - [] = canon_waitable_join(task, wfi, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, 0) + [] = canon_waitable_join(thread, wfi, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, 0) assert(event == EventCode.FUTURE_WRITE) assert(mem[0] == wfi) assert(mem[4] == CopyResult.COMPLETED) - [] = canon_future_drop_writable(FutureType(None), task, wfi) + [] = canon_future_drop_writable(FutureType(None), thread, wfi) - [packed] = canon_stream_new(StreamType(None), task) + [packed] = canon_stream_new(StreamType(None), thread) rsi,wsi = unpack_new_ends(packed) - [ret] = canon_stream_write(StreamType(None), async_opts, task, wsi, 10000, 3) + [ret] = canon_stream_write(StreamType(None), async_opts, thread, wsi, 10000, 3) assert(ret == definitions.BLOCKED) - [ret] = canon_stream_read(StreamType(None), async_opts, task, rsi, 2000, 1) + [ret] = canon_stream_read(StreamType(None), async_opts, thread, rsi, 2000, 1) result,n = unpack_result(ret) assert(n == 1 and result == CopyResult.COMPLETED) - [ret] = canon_stream_read(StreamType(None), async_opts, task, rsi, 2000, 4) + [ret] = canon_stream_read(StreamType(None), async_opts, thread, rsi, 2000, 4) result,n = unpack_result(ret) assert(n == 2 and result == CopyResult.COMPLETED) - [] = canon_stream_drop_readable(StreamType(None), task, rsi) + [] = canon_stream_drop_readable(StreamType(None), thread, rsi) - [] = canon_waitable_join(task, wsi, seti) - [event] = canon_waitable_set_wait(True, mem, task, seti, 0) + [] = canon_waitable_join(thread, wsi, seti) + [event] = canon_waitable_set_wait(True, mem, thread, seti, 0) assert(event == EventCode.STREAM_WRITE) assert(mem[0] == wsi) result,n = unpack_result(mem[4]) assert(result == CopyResult.DROPPED) assert(n == 3) - [] = canon_stream_drop_writable(StreamType(None), task, 
wsi) + [] = canon_stream_drop_writable(StreamType(None), thread, wsi) - [] = canon_waitable_set_drop(task, seti) + [] = canon_waitable_set_drop(thread, seti) return [] run_lift(sync_opts, inst, ft, core_func, lambda:[], lambda _:()) @@ -2514,6 +2581,150 @@ def core_func(thread, args): inst = ComponentInstance(store) run_lift(opts, inst, FuncType([], []), core_func, lambda:[], lambda _:()) +def test_threads(): + store = Store() + inst = ComponentInstance(store) + mem = bytearray(8) + opts = mk_opts(memory = mem) + + ftbl = Table() + ft = CoreFuncType(['i32'],[]) + + def thread_func1(thread, args): + assert(args == [13]) + return [] + fi1 = ftbl.add(CoreFuncRef(ft, thread_func1)) + + def thread_func2(thread, args): + [mainthreadi] = args + [ret] = canon_thread_yield_to(True, thread, mainthreadi) + assert(ret == SuspendResult.COMPLETED) + return [] + fi2 = ftbl.add(CoreFuncRef(ft, thread_func2)) + + def thread_func3(thread, args): + [mainthreadi] = args + [] = canon_thread_resume_later(thread, mainthreadi) + return [] + fi3 = ftbl.add(CoreFuncRef(ft, thread_func3)) + + def thread_func4(thread, args): + [ptr] = args + [ret] = canon_thread_yield(False, thread) + assert(ret == SuspendResult.COMPLETED) + mem[ptr] = mem[ptr] + 1 + [ret] = canon_thread_yield(False, thread) + assert(ret == SuspendResult.COMPLETED) + mem[ptr] = mem[ptr] + 1 + return [] + fi4 = ftbl.add(CoreFuncRef(ft, thread_func4)) + + def core_func(thread, args): + assert(not args) + + [mainthreadi] = canon_thread_index(thread) + + [threadi] = canon_thread_new_indirect(ft, ftbl, thread, fi1, 13) + [ret] = canon_thread_yield_to(True, thread, threadi) + assert(ret == SuspendResult.COMPLETED) + + [threadi] = canon_thread_new_indirect(ft, ftbl, thread, fi2, mainthreadi) + [ret] = canon_thread_switch_to(True, thread, threadi) + assert(ret == SuspendResult.COMPLETED) + + [threadi] = canon_thread_new_indirect(ft, ftbl, thread, fi3, mainthreadi) + [] = canon_thread_resume_later(thread, threadi) + [ret] = 
canon_thread_suspend(True, thread) + assert(ret == SuspendResult.COMPLETED) + + ptr = 4 + mem[ptr] = 0 + for i in range(5): + [threadi] = canon_thread_new_indirect(ft, ftbl, thread, fi4, ptr) + [] = canon_thread_resume_later(thread, threadi) + while mem[ptr] != 10: + canon_thread_yield(False, thread) + + return [42] + + result = None + def on_resolve(v): + nonlocal result + [result] = v + + run_lift(opts, inst, FuncType([], [U8Type()]), core_func, lambda:[], on_resolve) + assert(result == 42) + +def test_thread_cancel_callback(): + store = Store() + producer_inst = ComponentInstance(store) + producer_ft = FuncType([],[U32Type()]) + + producer_opts1 = mk_opts(sync = False) + def core_producer1(thread, args): + assert(not args) + return [CallbackCode.YIELD] + def core_producer_callback1(thread, args): + [event,payload1,payload2] = args + assert(event == EventCode.NONE and payload1 == 0 and payload2 == 0) + [] = canon_task_return(thread, [U32Type()], producer_opts1, [42]) + return [CallbackCode.EXIT] + producer_opts1.callback = core_producer_callback1 + producer_callee1 = partial(canon_lift, producer_opts1, producer_inst, producer_ft, core_producer1) + + producer_opts2 = mk_opts(sync = False) + def core_producer2(thread, args): + assert(not args) + [ret] = canon_thread_yield(False, thread) + assert(ret == SuspendResult.COMPLETED) + [] = canon_task_return(thread, [U32Type()], producer_opts2, [43]) + return [CallbackCode.EXIT] + def core_producer_callback2(thread, args): + assert(False) + producer_opts2.callback = core_producer_callback2 + producer_callee2 = partial(canon_lift, producer_opts2, producer_inst, producer_ft, core_producer2) + + consumer_inst = ComponentInstance(store) + consumer_ft = FuncType([], []) + consumer_mem = bytearray(24) + consumer_opts = mk_opts(consumer_mem, sync = False) + + def core_consumer(thread, args): + assert(len(args) == 0) + + retp1 = 8 + [ret] = canon_lower(consumer_opts, producer_ft, producer_callee1, thread, [retp1]) + state,subi1 = 
unpack_result(ret) + assert(state == Subtask.State.STARTED) + + retp2 = 12 + [ret] = canon_lower(consumer_opts, producer_ft, producer_callee2, thread, [retp2]) + state,subi2 = unpack_result(ret) + assert(state == Subtask.State.STARTED) + + [ret] = canon_subtask_cancel(False, thread, subi1) + assert(ret == definitions.BLOCKED) + + retp3 = 16 + [seti] = canon_waitable_set_new(thread) + [] = canon_waitable_join(thread, subi1, seti) + [event] = canon_waitable_set_wait(True, consumer_mem, thread, seti, retp3) + assert(event == EventCode.SUBTASK) + assert(consumer_mem[retp3] == subi1) + assert(consumer_mem[retp3+4] == Subtask.State.RETURNED) + assert(consumer_mem[retp1] == 42) + + [] = canon_waitable_join(thread, subi2, seti) + [event] = canon_waitable_set_wait(True, consumer_mem, thread, seti, retp3) + assert(event == EventCode.SUBTASK) + assert(consumer_mem[retp3] == subi2) + assert(consumer_mem[retp3+4] == Subtask.State.RETURNED) + assert(consumer_mem[retp2] == 43) + + return [] + + run_lift(mk_opts(), consumer_inst, consumer_ft, core_consumer, lambda:[], lambda _:()) + test_roundtrips() test_handles() test_async_to_async() @@ -2534,5 +2745,7 @@ def core_func(thread, args): test_cancel_subtask() test_self_empty() test_async_flat_params() +test_threads() +test_thread_cancel_callback() print("All tests passed")