diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index 119d98c7276a..7e3bbf4a12b8 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -423,7 +423,6 @@ impl WastTest { "misc_testsuite/simd/almost-extmul.wast", "misc_testsuite/simd/canonicalize-nan.wast", "misc_testsuite/simd/issue_3327_bnot_lowering.wast", - "spec_testsuite/simd_bit_shift.wast", "spec_testsuite/simd_boolean.wast", "spec_testsuite/simd_f32x4.wast", "spec_testsuite/simd_f32x4_arith.wast", @@ -433,25 +432,19 @@ impl WastTest { "spec_testsuite/simd_f64x2_arith.wast", "spec_testsuite/simd_f64x2_pmin_pmax.wast", "spec_testsuite/simd_f64x2_rounding.wast", - "spec_testsuite/simd_i16x8_arith.wast", "spec_testsuite/simd_i16x8_arith2.wast", "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast", "spec_testsuite/simd_i16x8_extmul_i8x16.wast", "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast", - "spec_testsuite/simd_i16x8_sat_arith.wast", - "spec_testsuite/simd_i32x4_arith.wast", "spec_testsuite/simd_i32x4_arith2.wast", "spec_testsuite/simd_i32x4_dot_i16x8.wast", "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast", "spec_testsuite/simd_i32x4_extmul_i16x8.wast", "spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast", "spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast", - "spec_testsuite/simd_i64x2_arith.wast", "spec_testsuite/simd_i64x2_arith2.wast", "spec_testsuite/simd_i64x2_extmul_i32x4.wast", - "spec_testsuite/simd_i8x16_arith.wast", "spec_testsuite/simd_i8x16_arith2.wast", - "spec_testsuite/simd_i8x16_sat_arith.wast", "spec_testsuite/simd_lane.wast", "spec_testsuite/simd_load.wast", "spec_testsuite/simd_load_zero.wast", @@ -499,6 +492,13 @@ impl WastTest { "multi-memory/simd_memory-multi.wast", "misc_testsuite/simd/issue4807.wast", "spec_testsuite/simd_const.wast", + "spec_testsuite/simd_i8x16_sat_arith.wast", + "spec_testsuite/simd_i64x2_arith.wast", + "spec_testsuite/simd_i16x8_arith.wast", + "spec_testsuite/simd_i32x4_arith.wast", + "spec_testsuite/simd_i16x8_sat_arith.wast", + "spec_testsuite/simd_i8x16_arith.wast", + "spec_testsuite/simd_bit_shift.wast", ]; if unsupported.iter().any(|part| self.path.ends_with(part)) { diff --git a/tests/disas/winch/x64/i16x8/neg/neg.wat b/tests/disas/winch/x64/i16x8/neg/neg.wat new file mode 100644 index 000000000000..c24ef451d87b --- /dev/null +++ b/tests/disas/winch/x64/i16x8/neg/neg.wat @@ -0,0 +1,33 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i16x8.neg (v128.const i64x2 0xFFFFFFFFFFFFFFFF 42) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; vpxor %xmm15, %xmm15, %xmm15 +;; vpsubw %xmm0, %xmm15, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 +;; 45: addb %al, (%rax) +;; 47: addb %al, (%rax) +;; 49: addb %al, (%rax) +;; 4b: addb %al, (%rax) +;; 4d: addb %al, (%rax) +;; 4f: addb %bh, %bh diff --git a/tests/disas/winch/x64/i16x8/shift/shl.wat b/tests/disas/winch/x64/i16x8/shift/shl.wat new file mode 100644 index 000000000000..e087685d165f --- /dev/null +++ b/tests/disas/winch/x64/i16x8/shift/shl.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i16x8.shl (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0xf, %eax +;; vmovd %eax, %xmm15 +;; vpsllw %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i16x8/shift/shr_s.wat b/tests/disas/winch/x64/i16x8/shift/shr_s.wat new file mode 100644 index 000000000000..c18823f977f5 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/shift/shr_s.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i16x8.shr_s (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0xf, %eax +;; vmovd %eax, %xmm15 +;; vpsraw %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i16x8/shift/shr_u.wat b/tests/disas/winch/x64/i16x8/shift/shr_u.wat new file mode 100644 index 000000000000..b154bc918758 --- /dev/null +++ b/tests/disas/winch/x64/i16x8/shift/shr_u.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i16x8.shr_u (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0xf, %eax +;; vmovd %eax, %xmm15 +;; vpsrlw %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i32x4/neg/neg.wat b/tests/disas/winch/x64/i32x4/neg/neg.wat new file mode 100644 index 000000000000..9d55702fa383 --- /dev/null +++ b/tests/disas/winch/x64/i32x4/neg/neg.wat @@ -0,0 +1,33 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i32x4.neg (v128.const i64x2 0xFFFFFFFFFFFFFFFF 42) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; vpxor %xmm15, %xmm15, %xmm15 +;; vpsubd %xmm0, %xmm15, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 +;; 45: addb %al, (%rax) +;; 47: addb %al, (%rax) +;; 49: addb %al, (%rax) +;; 4b: addb %al, (%rax) +;; 4d: addb %al, (%rax) +;; 4f: addb %bh, %bh diff --git a/tests/disas/winch/x64/i32x4/shift/shl.wat b/tests/disas/winch/x64/i32x4/shift/shl.wat new file mode 100644 index 000000000000..72e64363fd5c --- /dev/null +++ b/tests/disas/winch/x64/i32x4/shift/shl.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i32x4.shl (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0x1f, %eax +;; vmovd %eax, %xmm15 +;; vpslld %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i32x4/shift/shr_s.wat b/tests/disas/winch/x64/i32x4/shift/shr_s.wat new file mode 100644 index 000000000000..5c87156bb713 --- /dev/null +++ b/tests/disas/winch/x64/i32x4/shift/shr_s.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i32x4.shr_s (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0x1f, %eax +;; vmovd %eax, %xmm15 +;; vpsrad %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i32x4/shift/shr_u.wat b/tests/disas/winch/x64/i32x4/shift/shr_u.wat new file mode 100644 index 000000000000..a12e6cfab9f7 --- /dev/null +++ b/tests/disas/winch/x64/i32x4/shift/shr_u.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i32x4.shr_u (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0x1f, %eax +;; vmovd %eax, %xmm15 +;; vpsrld %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i64x2/neg/neg.wat b/tests/disas/winch/x64/i64x2/neg/neg.wat new file mode 100644 index 000000000000..72e02f862dff --- /dev/null +++ b/tests/disas/winch/x64/i64x2/neg/neg.wat @@ -0,0 +1,33 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i64x2.neg (v128.const i64x2 0xFFFFFFFFFFFFFFFF 42) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; vpxor %xmm15, %xmm15, %xmm15 +;; vpsubq %xmm0, %xmm15, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 +;; 45: addb %al, (%rax) +;; 47: addb %al, (%rax) +;; 49: addb %al, (%rax) +;; 4b: addb %al, (%rax) +;; 4d: addb %al, (%rax) +;; 4f: addb %bh, %bh diff --git a/tests/disas/winch/x64/i64x2/shift/shl.wat b/tests/disas/winch/x64/i64x2/shift/shl.wat new file mode 100644 index 000000000000..6db216c0514a --- /dev/null +++ b/tests/disas/winch/x64/i64x2/shift/shl.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i64x2.shl (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0x3f, %eax +;; vmovd %eax, %xmm15 +;; vpsllq %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i64x2/shift/shr_s.wat b/tests/disas/winch/x64/i64x2/shift/shr_s.wat new file mode 100644 index 000000000000..e70e35e90562 --- /dev/null +++ b/tests/disas/winch/x64/i64x2/shift/shr_s.wat @@ -0,0 +1,54 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i64x2.shr_s (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x60 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x37(%rip), %xmm0 +;; andl $0x3f, %eax +;; vmovd %eax, %xmm15 +;; vmovdqu 0x38(%rip), %xmm1 +;; vpsrlq %xmm15, %xmm1, %xmm1 +;; vpsrlq %xmm15, %xmm0, %xmm0 +;; vpxor %xmm1, %xmm0, %xmm0 +;; vpsubq %xmm1, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 60: ud2 +;; 62: addb %al, (%rax) +;; 64: addb %al, (%rax) +;; 66: addb %al, (%rax) +;; 68: addb %al, (%rax) +;; 6a: addb %al, (%rax) +;; 6c: addb %al, (%rax) +;; 6e: addb %al, (%rax) +;; 70: addl %eax, (%rax) +;; 72: addb %al, (%rax) +;; 74: addb %al, (%rax) +;; 76: addb %al, (%rax) +;; 78: addb (%rax), %al +;; 7a: addb %al, (%rax) +;; 7c: addb %al, (%rax) +;; 7e: addb %al, (%rax) +;; 80: addb %al, (%rax) +;; 82: addb %al, (%rax) +;; 84: addb %al, (%rax) +;; 86: addb %al, (%rax) +;; 8c: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i64x2/shift/shr_u.wat b/tests/disas/winch/x64/i64x2/shift/shr_u.wat new file mode 100644 index 000000000000..a9ab3e496966 --- /dev/null +++ b/tests/disas/winch/x64/i64x2/shift/shr_u.wat @@ -0,0 +1,39 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i64x2.shr_u (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x4b +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x17(%rip), %xmm0 +;; andl $0x3f, %eax +;; vmovd %eax, %xmm15 +;; vpsrlq %xmm15, %xmm0, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 4b: ud2 +;; 4d: addb %al, (%rax) +;; 4f: addb %al, (%rcx) +;; 51: addb %al, (%rax) +;; 53: addb %al, (%rax) +;; 55: addb %al, (%rax) +;; 57: addb %al, (%rdx) +;; 59: addb %al, (%rax) +;; 5b: addb %al, (%rax) +;; 5d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/neg/neg.wat b/tests/disas/winch/x64/i8x16/neg/neg.wat new file mode 100644 index 000000000000..c89173dfb3f9 --- /dev/null +++ b/tests/disas/winch/x64/i8x16/neg/neg.wat @@ -0,0 +1,33 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i8x16.neg (v128.const i64x2 0xFFFFFFFFFFFFFFFF 42) + ))) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x43 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movdqu 0x1c(%rip), %xmm0 +;; vpxor %xmm15, %xmm15, %xmm15 +;; vpsubb %xmm0, %xmm15, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 43: ud2 +;; 45: addb %al, (%rax) +;; 47: addb %al, (%rax) +;; 49: addb %al, (%rax) +;; 4b: addb %al, (%rax) +;; 4d: addb %al, (%rax) +;; 4f: addb %bh, %bh diff --git a/tests/disas/winch/x64/i8x16/shift/shl.wat b/tests/disas/winch/x64/i8x16/shift/shl.wat new file mode 100644 index 000000000000..3e62dee7e68c --- /dev/null +++ b/tests/disas/winch/x64/i8x16/shift/shl.wat @@ -0,0 +1,50 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i8x16.shl (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x5f +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x37(%rip), %xmm0 +;; andl $7, %eax +;; vmovd %eax, %xmm15 +;; vpsllw %xmm15, %xmm0, %xmm0 +;; leaq 0x34(%rip), %r11 +;; shll $4, %eax +;; vmovdqu (%r11, %rax), %xmm15 +;; vpand %xmm0, %xmm15, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 5f: ud2 +;; 61: addb %al, (%rax) +;; 63: addb %al, (%rax) +;; 65: addb %al, (%rax) +;; 67: addb %al, (%rax) +;; 69: addb %al, (%rax) +;; 6b: addb %al, (%rax) +;; 6d: addb %al, (%rax) +;; 6f: addb %al, (%rcx) +;; 71: addb %al, (%rax) +;; 73: addb %al, (%rax) +;; 75: addb %al, (%rax) +;; 77: addb %al, (%rdx) +;; 79: addb %al, (%rax) +;; 7b: addb %al, (%rax) +;; 7d: addb %al, (%rax) +;; 7f: addb %bh, %bh diff --git a/tests/disas/winch/x64/i8x16/shift/shr_s.wat b/tests/disas/winch/x64/i8x16/shift/shr_s.wat new file mode 100644 index 000000000000..c9db7668b5e3 --- /dev/null +++ b/tests/disas/winch/x64/i8x16/shift/shr_s.wat @@ -0,0 +1,50 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i8x16.shr_s (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x5f +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x37(%rip), %xmm0 +;; andl $7, %eax +;; addl $8, %eax +;; vmovd %eax, %xmm15 +;; vpunpcklbw %xmm0, %xmm0, %xmm1 +;; vpunpckhbw %xmm0, %xmm0, %xmm2 +;; vpsraw %xmm15, %xmm1, %xmm1 +;; vpsraw %xmm15, %xmm2, %xmm2 +;; vpacksswb %xmm2, %xmm1, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 5f: ud2 +;; 61: addb %al, (%rax) +;; 63: addb %al, (%rax) +;; 65: addb %al, (%rax) +;; 67: addb %al, (%rax) +;; 69: addb %al, (%rax) +;; 6b: addb %al, (%rax) +;; 6d: addb %al, (%rax) +;; 6f: addb %al, (%rcx) +;; 71: addb %al, (%rax) +;; 73: addb %al, (%rax) +;; 75: addb %al, (%rax) +;; 77: addb %al, (%rdx) +;; 79: addb %al, (%rax) +;; 7b: addb %al, (%rax) +;; 7d: addb %al, (%rax) diff --git a/tests/disas/winch/x64/i8x16/shift/shr_u.wat b/tests/disas/winch/x64/i8x16/shift/shr_u.wat new file mode 100644 index 000000000000..50aa63c9fde9 --- /dev/null +++ b/tests/disas/winch/x64/i8x16/shift/shr_u.wat @@ -0,0 +1,50 @@ +;;! target = "x86_64" +;;! test = "winch" +;;! 
flags = [ "-Ccranelift-has-avx" ] + +(module + (func (result v128) + (i8x16.shr_u (v128.const i64x2 1 2) (i32.const 3)) + ) +) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x10(%r11), %r11 +;; addq $0x10, %r11 +;; cmpq %rsp, %r11 +;; ja 0x5f +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $3, %eax +;; movdqu 0x37(%rip), %xmm0 +;; andl $7, %eax +;; vmovd %eax, %xmm15 +;; vpsrlw %xmm15, %xmm0, %xmm0 +;; leaq 0x34(%rip), %r11 +;; shll $4, %eax +;; vmovdqu (%r11, %rax), %xmm15 +;; vpand %xmm0, %xmm15, %xmm0 +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 5f: ud2 +;; 61: addb %al, (%rax) +;; 63: addb %al, (%rax) +;; 65: addb %al, (%rax) +;; 67: addb %al, (%rax) +;; 69: addb %al, (%rax) +;; 6b: addb %al, (%rax) +;; 6d: addb %al, (%rax) +;; 6f: addb %al, (%rcx) +;; 71: addb %al, (%rax) +;; 73: addb %al, (%rax) +;; 75: addb %al, (%rax) +;; 77: addb %al, (%rdx) +;; 79: addb %al, (%rax) +;; 7b: addb %al, (%rax) +;; 7d: addb %al, (%rax) +;; 7f: addb %bh, %bh diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs index f4e9484fe7e0..737521760543 100644 --- a/winch/codegen/src/isa/aarch64/masm.rs +++ b/winch/codegen/src/isa/aarch64/masm.rs @@ -1157,6 +1157,19 @@ impl Masm for MacroAssembler { ) -> Result<()> { Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) } + + fn v128_neg(&mut self, _op: WritableReg, _size: OperandSize) -> Result<()> { + Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) + } + + fn v128_shift( + &mut self, + _context: &mut CodeGenContext, + _lane_width: OperandSize, + _shift_kind: ShiftKind, + ) -> Result<()> { + Err(anyhow!(CodeGenError::unimplemented_masm_instruction())) + } } impl MacroAssembler { diff --git a/winch/codegen/src/isa/x64/address.rs b/winch/codegen/src/isa/x64/address.rs index 47229ae37448..e7ce8d3c5ea0 100644 --- a/winch/codegen/src/isa/x64/address.rs +++ b/winch/codegen/src/isa/x64/address.rs @@ -10,6 +10,13 @@ pub(crate) enum Address { Offset { base: Reg, offset: u32 }, /// Address to identify a constant. Const(Constant), + /// Address at `(base + index * 2^shift) + simm32` + ImmRegRegShift { + simm32: i32, + base: Reg, + index: Reg, + shift: u8, + }, } impl Address { diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs index ed9547c92f16..12e086b66a12 100644 --- a/winch/codegen/src/isa/x64/asm.rs +++ b/winch/codegen/src/isa/x64/asm.rs @@ -323,9 +323,9 @@ impl Assembler { buffer: &mut MachBuffer, memflags: MemFlags, ) -> SyntheticAmode { - match addr { + match *addr { Address::Offset { base, offset } => { - let amode = Amode::imm_reg(*offset as i32, (*base).into()).with_flags(memflags); + let amode = Amode::imm_reg(offset as i32, base.into()).with_flags(memflags); SyntheticAmode::real(amode) } Address::Const(c) => { @@ -333,18 +333,30 @@ impl Assembler { // `SyntheticAmode::ConstantOffset` addressing mode // until the address is referenced by an actual // instruction. - let constant_data = pool.get(*c); - let data = VCodeConstantData::Pool(*c, constant_data.clone()); + let constant_data = pool.get(c); + let data = VCodeConstantData::Pool(c, constant_data.clone()); // If the constant data is not marked as used, it will be // inserted, therefore, it needs to be registered. 
let needs_registration = !constants.pool_uses(&data); - let constant = constants.insert(VCodeConstantData::Pool(*c, constant_data.clone())); + let constant = constants.insert(VCodeConstantData::Pool(c, constant_data.clone())); if needs_registration { buffer.register_constant(&constant, &data); } SyntheticAmode::ConstantOffset(constant) } + Address::ImmRegRegShift { + simm32, + base, + index, + shift, + } => SyntheticAmode::Real(Amode::ImmRegRegShift { + simm32, + base: base.into(), + index: index.into(), + shift, + flags: memflags, + }), } } @@ -1937,6 +1949,38 @@ impl Assembler { }); } + /// Move unaligned packed integer values from address `src` to `dst`. + pub fn xmm_vmovdqu_mr(&mut self, src: &Address, dst: WritableReg, flags: MemFlags) { + let src = Self::to_synthetic_amode( + src, + &mut self.pool, + &mut self.constants, + &mut self.buffer, + flags, + ); + self.emit(Inst::XmmUnaryRmRVex { + op: AvxOpcode::Vmovdqu, + src: XmmMem::unwrap_new(RegMem::mem(src)), + dst: dst.map(Into::into), + }); + } + + /// Move integer from `src` to xmm register `dst` using an AVX instruction. + pub fn avx_gpr_to_xmm(&mut self, src: Reg, dst: WritableReg, size: OperandSize) { + let op = match size { + OperandSize::S32 => AvxOpcode::Vmovd, + OperandSize::S64 => AvxOpcode::Vmovq, + _ => unreachable!(), + }; + + self.emit(Inst::GprToXmmVex { + op, + src: src.into(), + dst: dst.map(Into::into), + src_size: size.into(), + }) + } + /// The `vpinsr` opcode to use. fn vpinsr_opcode(size: OperandSize) -> AvxOpcode { match size { diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index 8bd8358647b9..d76d4ec65716 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -44,6 +44,38 @@ use cranelift_codegen::{ use wasmtime_cranelift::TRAP_UNREACHABLE; use wasmtime_environ::{PtrSize, WasmValType}; +// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs` +// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we +// need to fix up the bits that migrate from one half of the lane to the +// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift +// right by 0 (no movement), we want to retain all the bits so we mask with +// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so +// we mask with `0x7f`; etc. + +#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row. +const I8X16_ISHL_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, + 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, + 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, + 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +]; + +#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row. 
+const I8X16_USHR_MASKS: [u8; 128] = [ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, +]; + /// x64 MacroAssembler. pub(crate) struct MacroAssembler { /// Stack pointer offset. @@ -2198,6 +2230,202 @@ impl Masm for MacroAssembler { Ok(()) } + + fn v128_neg(&mut self, op: WritableReg, size: OperandSize) -> Result<()> { + let tmp = regs::scratch_xmm(); + self.v128_xor(tmp, tmp, writable!(tmp))?; + self.v128_sub(tmp, op.to_reg(), op, size, HandleOverflowKind::None)?; + Ok(()) + } + + fn v128_shift( + &mut self, + context: &mut CodeGenContext, + lane_width: OperandSize, + kind: ShiftKind, + ) -> Result<()> { + self.ensure_has_avx()?; + let shift_amount = context.pop_to_reg(self, None)?.reg; + let operand = context.pop_to_reg(self, None)?.reg; + + let tmp_xmm = regs::scratch_xmm(); + let tmp = regs::scratch(); + let amount_mask = lane_width.num_bits() - 1; + self.and( + writable!(shift_amount), + shift_amount, + RegImm::i32(amount_mask as i32), + OperandSize::S32, + )?; + + let shl_normal = |this: &mut Self, op: AvxOpcode| { + this.asm + .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32); + this.asm + .xmm_vex_rr(op, operand, tmp_xmm, writable!(operand)); + }; + + let shift_i8x16 = |this: &mut Self, masks: &'static [u8], op: AvxOpcode| { + // The case for i8x16 is a little bit trickier because x64 doesn't provide a 8bit + // shift instruction. Instead, we shift as 16bits, and then mask the bits in the + // 8bits lane, for example (with 2 8bits lanes): + // - Before shifting: + // 01001101 11101110 + // - shifting by 2 left: + // 00110111 10111000 + // ^^_ these bits come from the previous byte, and need to be masked. + // - The mask: + // 11111100 11111111 + // - After masking: + // 00110100 10111000 + // + // The mask is loaded from a well known memory, depending on the shift amount. + + this.asm + .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32); + + // perform 16 bit shift + this.asm + .xmm_vex_rr(op, operand, tmp_xmm, writable!(operand)); + + // get a handle to the masks array constant. + let masks_addr = this.asm.add_constant(masks); + + // Load the masks array effective address into the tmp register. + this.asm.lea(&masks_addr, writable!(tmp), OperandSize::S64); + + // Compute the offset of the mask that we need to use. This is shift_amount * 16 == + // shift_amount << 4. + this.asm + .shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32); + + // Load the mask to tmp_xmm. + this.asm.xmm_vmovdqu_mr( + &Address::ImmRegRegShift { + simm32: 0, + base: tmp, + index: shift_amount, + shift: 0, + }, + writable!(tmp_xmm), + MemFlags::trusted(), + ); + + // Mask unwanted bits from operand. 
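+            // (The mask row indexed by the scaled `shift_amount` is now in `tmp_xmm`;
+            // the `vpand` below clears exactly the bits that leaked across a byte
+            // boundary during the 16-bit shift.)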
+ this.asm + .xmm_vex_rr(AvxOpcode::Vpand, tmp_xmm, operand, writable!(operand)); + }; + + let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext| -> Result<()> { + const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000; + + // AVX doesn't have an instruction for i64x2 signed right shift. Instead we use the + // following formula (from hacker's delight 2-7), where x is the value and n the shift + // amount, for each lane: + // t = (1 << 63) >> n; ((x >> n) ^ t) - t + + // we need an extra scratch register + let tmp_xmm2 = context.any_fpr(this)?; + + this.asm + .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32); + + let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes()); + + this.asm + .xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted()); + this.asm + .xmm_vex_rr(AvxOpcode::Vpsrlq, tmp_xmm2, tmp_xmm, writable!(tmp_xmm2)); + this.asm + .xmm_vex_rr(AvxOpcode::Vpsrlq, operand, tmp_xmm, writable!(operand)); + this.asm + .xmm_vex_rr(AvxOpcode::Vpxor, operand, tmp_xmm2, writable!(operand)); + this.asm + .xmm_vex_rr(AvxOpcode::Vpsubq, operand, tmp_xmm2, writable!(operand)); + + context.free_reg(tmp_xmm2); + + Ok(()) + }; + + let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext| -> Result<()> { + // Since the x86 instruction set does not have an 8x16 shift instruction and the + // approach used for `ishl` and `ushr` cannot be easily used (the masks do not + // preserve the sign), we use a different approach here: separate the low and + // high lanes, shift them separately, and merge them into the final result. + // + // Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., + // s15]: + // + // lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)] + // shifted_lo.i16x8 = shift each lane of `low` + // hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)] + // shifted_hi.i16x8 = shift each lane of `high` + // result = [s0'', s1'', ..., s15''] + + // In order for `packsswb` later to only use the high byte of each + // 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to + // fill in the upper bits appropriately. + this.asm + .add_ir(8, writable!(shift_amount), OperandSize::S32); + this.asm + .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32); + + let tmp_lo = context.any_fpr(this)?; + let tmp_hi = context.any_fpr(this)?; + + // Extract lower and upper bytes. + this.asm + .xmm_vex_rr(AvxOpcode::Vpunpcklbw, operand, operand, writable!(tmp_lo)); + this.asm + .xmm_vex_rr(AvxOpcode::Vpunpckhbw, operand, operand, writable!(tmp_hi)); + + // Perform 16bit right shift of upper and lower bytes. + this.asm + .xmm_vex_rr(AvxOpcode::Vpsraw, tmp_lo, tmp_xmm, writable!(tmp_lo)); + this.asm + .xmm_vex_rr(AvxOpcode::Vpsraw, tmp_hi, tmp_xmm, writable!(tmp_hi)); + + // Merge lower and upper bytes back. 
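+            // Thanks to the extra 8-bit shift above, every 16-bit lane already holds
+            // a value that fits in an i8, so the saturating pack below is lossless
+            // and effectively keeps the low byte of each lane.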
+ this.asm + .xmm_vex_rr(AvxOpcode::Vpacksswb, tmp_lo, tmp_hi, writable!(operand)); + + context.free_reg(tmp_lo); + context.free_reg(tmp_hi); + + Ok(()) + }; + + match (lane_width, kind) { + // shl + (OperandSize::S8, ShiftKind::Shl) => { + shift_i8x16(self, &I8X16_ISHL_MASKS, AvxOpcode::Vpsllw) + } + (OperandSize::S16, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpsllw), + (OperandSize::S32, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpslld), + (OperandSize::S64, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpsllq), + // shr_u + (OperandSize::S8, ShiftKind::ShrU) => { + shift_i8x16(self, &I8X16_USHR_MASKS, AvxOpcode::Vpsrlw) + } + (OperandSize::S16, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrlw), + (OperandSize::S32, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrld), + (OperandSize::S64, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrlq), + // shr_s + (OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?, + (OperandSize::S16, ShiftKind::ShrS) => shl_normal(self, AvxOpcode::Vpsraw), + (OperandSize::S32, ShiftKind::ShrS) => shl_normal(self, AvxOpcode::Vpsrad), + (OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?, + + _ => bail!(CodeGenError::invalid_operand_combination()), + } + + context.free_reg(shift_amount); + context + .stack + .push(TypedReg::new(WasmValType::V128, operand).into()); + Ok(()) + } } impl MacroAssembler { diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index 1af99b406688..299bb144a1fa 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -1817,4 +1817,19 @@ pub(crate) trait MacroAssembler { context: &mut CodeGenContext, lane_width: OperandSize, ) -> Result<()>; + + /// Vectorized negate of the content of `op`, with lanes of size `size`. + fn v128_neg(&mut self, op: WritableReg, size: OperandSize) -> Result<()>; + + /// Perform the shift operation specified by `kind`, by the shift amount specified by the 32-bit + /// integer at the top the the stack, on the 128-bit vector specified by the second value + /// from the top of the stack, interpreted as packed integers of size `lane_width`. + /// + /// The shift amount is taken modulo `lane_width`. + fn v128_shift( + &mut self, + context: &mut CodeGenContext, + lane_width: OperandSize, + kind: ShiftKind, + ) -> Result<()>; } diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index 6bdce5e1a819..12ea71c97be9 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -460,6 +460,22 @@ macro_rules! 
def_unsupported { (emit I16x8SubSatS $($rest:tt)*) => {}; (emit I8x16SubSatU $($rest:tt)*) => {}; (emit I16x8SubSatU $($rest:tt)*) => {}; + (emit I8x16Neg $($rest:tt)*) => {}; + (emit I16x8Neg $($rest:tt)*) => {}; + (emit I32x4Neg $($rest:tt)*) => {}; + (emit I64x2Neg $($rest:tt)*) => {}; + (emit I8x16Shl $($rest:tt)*) => {}; + (emit I16x8Shl $($rest:tt)*) => {}; + (emit I32x4Shl $($rest:tt)*) => {}; + (emit I64x2Shl $($rest:tt)*) => {}; + (emit I8x16ShrU $($rest:tt)*) => {}; + (emit I16x8ShrU $($rest:tt)*) => {}; + (emit I32x4ShrU $($rest:tt)*) => {}; + (emit I64x2ShrU $($rest:tt)*) => {}; + (emit I8x16ShrS $($rest:tt)*) => {}; + (emit I16x8ShrS $($rest:tt)*) => {}; + (emit I32x4ShrS $($rest:tt)*) => {}; + (emit I64x2ShrS $($rest:tt)*) => {}; (emit $unsupported:tt $($rest:tt)*) => {$($rest)*}; } @@ -3908,6 +3924,94 @@ where }) } + fn visit_i8x16_neg(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_neg(writable!(op), OperandSize::S8)?; + Ok(TypedReg::new(WasmValType::V128, op)) + }) + } + + fn visit_i16x8_neg(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_neg(writable!(op), OperandSize::S16)?; + Ok(TypedReg::new(WasmValType::V128, op)) + }) + } + + fn visit_i32x4_neg(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_neg(writable!(op), OperandSize::S32)?; + Ok(TypedReg::new(WasmValType::V128, op)) + }) + } + + fn visit_i64x2_neg(&mut self) -> Self::Output { + self.context.unop(self.masm, |masm, op| { + masm.v128_neg(writable!(op), OperandSize::S64)?; + Ok(TypedReg::new(WasmValType::V128, op)) + }) + } + + fn visit_i8x16_shl(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S8, ShiftKind::Shl) + } + + fn visit_i16x8_shl(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S16, ShiftKind::Shl) + } + + fn visit_i32x4_shl(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S32, ShiftKind::Shl) + } + + fn visit_i64x2_shl(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S64, ShiftKind::Shl) + } + + fn visit_i8x16_shr_u(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S8, ShiftKind::ShrU) + } + + fn visit_i16x8_shr_u(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S16, ShiftKind::ShrU) + } + + fn visit_i32x4_shr_u(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S32, ShiftKind::ShrU) + } + + fn visit_i64x2_shr_u(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S64, ShiftKind::ShrU) + } + + fn visit_i8x16_shr_s(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S8, ShiftKind::ShrS) + } + + fn visit_i16x8_shr_s(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S16, ShiftKind::ShrS) + } + + fn visit_i32x4_shr_s(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S32, ShiftKind::ShrS) + } + + fn visit_i64x2_shr_s(&mut self) -> Self::Output { + self.masm + .v128_shift(&mut self.context, OperandSize::S64, ShiftKind::ShrS) + } + wasmparser::for_each_visit_simd_operator!(def_unsupported); }
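
The byte-lane shifts and the 64-bit-lane arithmetic right shift in `v128_shift` rely on identities rather than dedicated instructions. A minimal scalar sketch of those identities (plain Rust, not part of the patch; function names and the tiny test harness are illustrative only):

    /// i8 lanes shifted through a 16-bit shift, as `vpsllw` + `vpand` do above:
    /// two byte lanes are packed into one u16, shifted together, and the bits
    /// that crossed the byte boundary are cleared with row `n` of the mask table
    /// (`I8X16_ISHL_MASKS`; `shr_u` is symmetric with `0xff >> n`).
    fn i8x2_shl_via_i16(lo: u8, hi: u8, n: u32) -> (u8, u8) {
        let n = n & 7;                                     // shift amount mod 8
        let wide = (((hi as u16) << 8) | lo as u16) << n;  // one 16-bit lane shift
        let mask = 0xffu8 << n;                            // row `n` of I8X16_ISHL_MASKS
        ((wide as u8) & mask, ((wide >> 8) as u8) & mask)
    }

    /// Arithmetic right shift built from logical shifts (Hacker's Delight 2-7),
    /// the formula used for `i64x2.shr_s` above:
    ///   t = (1 << 63) >>_u n;  result = ((x >>_u n) ^ t) - t
    fn sar64_via_logical(x: i64, n: u32) -> i64 {
        let n = n & 63;
        let t = ((1u64 << 63) >> n) as i64;      // isolated, logically shifted sign bit
        let logical = ((x as u64) >> n) as i64;  // zero-filled shift of x
        (logical ^ t).wrapping_sub(t)            // flipping and subtracting sign-extends
    }

    fn main() {
        // The worked example from the `shift_i8x16` comment: lanes 11101110 and
        // 01001101 shifted left by 2, then masked with 11111100.
        assert_eq!(
            i8x2_shl_via_i16(0b1110_1110, 0b0100_1101, 2),
            (0b1011_1000, 0b0011_0100)
        );
        assert_eq!(sar64_via_logical(-1i64 << 40, 3), -1i64 << 37);
        assert_eq!(sar64_via_logical(42, 3), 5);
    }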