diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 51775e0a23..72aaab9276 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1371,7 +1371,7 @@ pub(crate) fn emit(
             RegMemImm::Imm { simm32 } => {
                 // FIXME JRS 2020Feb11: there are shorter encodings for
                 // cmp $imm, rax/eax/ax/al.
-                let use_imm8 = low8_will_sign_extend_to_32(*simm32);
+                let use_imm8 = is_cmp && low8_will_sign_extend_to_32(*simm32);

                 // And also here we use the "normal" G-E ordering.
                 let opcode = if is_cmp {
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 3178b1dd11..f5d7535da4 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -2999,6 +2999,11 @@ fn test_x64_emit() {
         "48855763",
         "testq 99(%rdi), %rdx",
     ));
+    insns.push((
+        Inst::test_rmi_r(OperandSize::Size64, RegMemImm::imm(127), rdx),
+        "48F7C27F000000",
+        "testq $127, %rdx",
+    ));
     insns.push((
         Inst::test_rmi_r(OperandSize::Size64, RegMemImm::imm(76543210), rdx),
         "48F7C2EAF48F04",
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index c720b28a32..019c416597 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -886,8 +886,10 @@ fn emit_shl_i128<C: LowerCtx<I = Inst>>(
     //     sub     amt, amt_src
     //     mov     tmp3, src_lo
     //     shr     tmp3, amt
-    //     or      tmp3, tmp2
     //     xor     dst_lo, dst_lo
+    //     test    amt_src, 127
+    //     cmovz   tmp2, dst_lo
+    //     or      tmp3, tmp2
     //     mov     amt, amt_src
     //     and     amt, 64
     //     cmovz   dst_hi, tmp3
@@ -945,6 +947,28 @@
         None,
         tmp3,
     ));
+    ctx.emit(Inst::alu_rmi_r(
+        OperandSize::Size64,
+        AluRmiROpcode::Xor,
+        RegMemImm::reg(dst_lo.to_reg()),
+        dst_lo,
+    ));
+
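+    // When amt_src is 0 mod 128, the "shr tmp3, 64 - amt" above was really
+    // a shift by zero: x86 masks the shift count to 6 bits, so a shift by
+    // 64 leaves the register untouched. The test/cmovz below therefore
+    // zeroes the OR operand in that case instead of using stale bits.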
+    ctx.emit(Inst::test_rmi_r(
+        OperandSize::Size64,
+        RegMemImm::imm(127),
+        amt_src,
+    ));
+    ctx.emit(Inst::cmove(
+        OperandSize::Size64,
+        CC::Z,
+        RegMem::reg(dst_lo.to_reg()),
+        tmp2,
+    ));

     ctx.emit(Inst::alu_rmi_r(
         OperandSize::Size64,
@@ -953,12 +977,6 @@
         tmp3,
     ));

-    ctx.emit(Inst::alu_rmi_r(
-        OperandSize::Size64,
-        AluRmiROpcode::Xor,
-        RegMemImm::reg(dst_lo.to_reg()),
-        dst_lo,
-    ));
     // This isn't semantically necessary, but it keeps the
     // register allocator happy, because it cannot otherwise
     // infer that cmovz + cmovnz always defines dst_hi.
@@ -1011,11 +1029,14 @@ fn emit_shr_i128<C: LowerCtx<I = Inst>>(
     //     mov     tmp1, src_hi
     //     {u,s}shr tmp1, amt_src
     //     mov     tmp2, src_lo
-    //     {u,s}shr tmp2, amt_src
+    //     ushr    tmp2, amt_src
     //     mov     amt, 64
     //     sub     amt, amt_src
     //     mov     tmp3, src_hi
     //     shl     tmp3, amt
+    //     xor     dst_lo, dst_lo
+    //     test    amt_src, 127
+    //     cmovz   tmp3, dst_lo
     //     or      tmp3, tmp2
     //     if is_signed:
     //         mov     dst_hi, src_hi
@@ -1053,7 +1074,13 @@
         amt_src,
         types::I64,
     ));
-    ctx.emit(Inst::shift_r(OperandSize::Size64, shift_kind, None, tmp2));
+    // N.B.: right-shift of *lower* half is *always* unsigned (its MSB is not a sign bit).
+    ctx.emit(Inst::shift_r(
+        OperandSize::Size64,
+        ShiftKind::ShiftRightLogical,
+        None,
+        tmp2,
+    ));

     ctx.emit(Inst::imm(OperandSize::Size64, 64, amt));
     ctx.emit(Inst::alu_rmi_r(
@@ -1076,6 +1103,26 @@
         tmp3,
     ));

+    ctx.emit(Inst::alu_rmi_r(
+        OperandSize::Size64,
+        AluRmiROpcode::Xor,
+        RegMemImm::reg(dst_lo.to_reg()),
+        dst_lo,
+    ));
+    // As in emit_shl_i128: when the amount is 0 mod 128, the 64 - amt shift
+    // above was a no-op, so explicitly zero the carried bits via cmovz.
+    ctx.emit(Inst::test_rmi_r(
+        OperandSize::Size64,
+        RegMemImm::imm(127),
+        amt_src,
+    ));
+    ctx.emit(Inst::cmove(
+        OperandSize::Size64,
+        CC::Z,
+        RegMem::reg(dst_lo.to_reg()),
+        tmp3,
+    ));
+
     ctx.emit(Inst::alu_rmi_r(
         OperandSize::Size64,
         AluRmiROpcode::Or,
@@ -1957,7 +2004,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let cst = (cst as u8) & (dst_ty.bits() as u8 - 1);
                 (Some(cst), None)
             } else {
-                (None, Some(put_input_in_reg(ctx, inputs[1])))
+                // If the shift amount is multi-reg, only its low register matters:
+                // the amount is taken mod the shifted type's bit width, which divides 2^64.
+                (None, Some(put_input_in_regs(ctx, inputs[1]).regs()[0]))
             };

             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
diff --git a/cranelift/filetests/filetests/isa/x64/i128.clif b/cranelift/filetests/filetests/isa/x64/i128.clif
index 5e5d2ffb86..9b1436898e 100644
--- a/cranelift/filetests/filetests/isa/x64/i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/i128.clif
@@ -644,212 +644,9 @@ block0(v0: i128):
 ; nextln: ret
 }

-function %f21(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
+; Shifts are covered by run-tests in shift-i128-run.clif.
+
-block0(v0: i128, v1: i32):
-    v2 = ushr v0, v1
-    return v2
-
-; check: movq %rdi, %rax
-; nextln: movq %rsi, %rdi
-; nextln: movq %rdi, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: shlq %cl, %rdi
-; nextln: orq %rax, %rdi
-; nextln: xorq %rax, %rax
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %rdx
-; nextln: cmovzq %rsi, %rax
-; nextln: cmovzq %rdi, %rcx
-; nextln: cmovnzq %rsi, %rcx
-; nextln: movq %rax, %rdx
-; nextln: movq %rcx, %rax
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f22(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = ishl v0, v1
-    return v2
-
-; check: movq %rsi, %rax
-; nextln: movq %rdi, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: shrq %cl, %rdi
-; nextln: orq %rax, %rdi
-; nextln: xorq %rax, %rax
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %rdx
-; nextln: cmovzq %rdi, %rcx
-; nextln: cmovzq %rsi, %rax
-; nextln: cmovnzq %rsi, %rcx
-; nextln: movq %rcx, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f23(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = sshr v0, v1
-    return v2
-
-; check: movq %rdi, %r8
-; nextln: movq %rsi, %rdi
-; nextln: movq %rdi, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: sarq %cl, %rsi
-; nextln: movq %rdx, %rcx
-; nextln: sarq %cl, %r8
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: movq %rdi, %rax
-; nextln: shlq %cl, %rax
-; nextln: orq %r8, %rax
-; nextln: sarq $$63, %rdi
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %rdx
-; nextln: cmovzq %rsi, %rdi
-; nextln: cmovzq %rax, %rcx
-; nextln: cmovnzq %rsi, %rcx
-; nextln: movq %rcx, %rax
-; nextln: movq %rdi, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f24(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = rotr.i128 v0, v1
-    return v2
-
-; check: movq %rsi, %r9
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %r9
-; nextln: movq %rdi, %rax
-; nextln: movq %rdx, %rcx
-; nextln: shrq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: movq %rsi, %r10
-; nextln: shlq %cl, %r10
-; nextln: orq %rax, %r10
-; nextln: xorq %r8, %r8
-; nextln: xorq %rax, %rax
-; nextln: movq %rdx, %rcx
-; nextln: andq $$64, %rcx
-; nextln: cmovzq %r9, %r8
-; nextln: cmovzq %r10, %rax
-; nextln: cmovnzq %r9, %rax
-; nextln: movl $$128, %r9d
-; nextln: subq %rdx, %r9
-; nextln: movq %rdi, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shlq %cl, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shlq %cl, %rsi
-; nextln: movl $$64, %ecx
-; nextln: subq %r9, %rcx
-; nextln: movq %rdi, %r10
-; nextln: shrq %cl, %r10
-; nextln: orq %rsi, %r10
-; nextln: xorq %rsi, %rsi
-; nextln: xorq %rdi, %rdi
-; nextln: andq $$64, %r9
-; nextln: cmovzq %r10, %rdi
-; nextln: cmovzq %rdx, %rsi
-; nextln: cmovnzq %rdx, %rdi
-; nextln: orq %rax, %rsi
-; nextln: orq %r8, %rdi
-; nextln: movq %rsi, %rax
-; nextln: movq %rdi, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f25(i128, i32) -> i128 {
-; check: pushq %rbp
-; nextln: movq %rsp, %rbp
-
-block0(v0: i128, v1: i32):
-    v2 = rotl.i128 v0, v1
-    return v2
-
-; check: movq %rdi, %r9
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %r9
-; nextln: movq %rsi, %rax
-; nextln: movq %rdx, %rcx
-; nextln: shlq %cl, %rax
-; nextln: movl $$64, %ecx
-; nextln: subq %rdx, %rcx
-; nextln: movq %rdi, %r10
-; nextln: shrq %cl, %r10
-; nextln: orq %rax, %r10
-; nextln: xorq %r8, %r8
-; nextln: xorq %rax, %rax
-; nextln: movq %rdx, %rcx
-; nextln: andq $$64, %rcx
-; nextln: cmovzq %r10, %rax
-; nextln: cmovzq %r9, %r8
-; nextln: cmovnzq %r9, %rax
-; nextln: movl $$128, %r9d
-; nextln: subq %rdx, %r9
-; nextln: movq %rsi, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shrq %cl, %rdx
-; nextln: movq %r9, %rcx
-; nextln: shrq %cl, %rdi
-; nextln: movl $$64, %ecx
-; nextln: subq %r9, %rcx
-; nextln: shlq %cl, %rsi
-; nextln: orq %rdi, %rsi
-; nextln: xorq %rdi, %rdi
-; nextln: xorq %rcx, %rcx
-; nextln: andq $$64, %r9
-; nextln: cmovzq %rdx, %rdi
-; nextln: cmovzq %rsi, %rcx
-; nextln: cmovnzq %rdx, %rcx
-; nextln: orq %r8, %rcx
-; nextln: orq %rax, %rdi
-; nextln: movq %rcx, %rax
-; nextln: movq %rdi, %rdx
-
-; nextln: movq %rbp, %rsp
-; nextln: popq %rbp
-; nextln: ret
-}
-
-function %f26(i128, i64) {
+function %f21(i128, i64) {
 ; check: pushq %rbp
 ; nextln: movq %rsp, %rbp

@@ -865,7 +662,7 @@ block0(v0: i128, v1: i64):
 ; nextln: ret
 }

-function %f27(i64) -> i128 {
+function %f22(i64) -> i128 {
 ; check: pushq %rbp
 ; nextln: movq %rsp, %rbp

@@ -883,7 +680,7 @@ block0(v0: i64):
 ; nextln: ret
 }

-function %f28(i128, b1) -> i128 {
+function %f23(i128, b1) -> i128 {
 block0(v0: i128, v1: b1):
     v2 = iconst.i128 0
     brnz v1, block1(v2)
@@ -930,7 +727,7 @@ block2(v6: i128):
 }


-function %f29(i128, i128, i64, i128, i128, i128) -> i128 {
+function %f24(i128, i128, i64, i128, i128, i128) -> i128 {
 block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):

     v6 = iadd.i128 v0, v1
@@ -974,7 +771,7 @@ block0(v0: i128, v1: i128, v2: i64, v3: i128, v4: i128, v5: i128):
 }


-function %f30(i128) -> i128, i128, i128, i64, i128, i128 {
+function %f25(i128) -> i128, i128, i128, i64, i128, i128 {
 ; check: pushq %rbp
 ; nextln: movq %rsp, %rbp

@@ -996,7 +793,7 @@ block0(v0: i128):
 }


-function %f31(i128, i128) -> i128, i128 {
+function %f26(i128, i128) -> i128, i128 {
     fn0 = %g(i128, i128) -> i128, i128
 block0(v0: i128, v1: i128):
     v2, v3 = call fn0(v0, v1)
@@ -1027,7 +824,7 @@ block0(v0: i128, v1: i128):
 }


-function %f32(i128) -> i128 {
+function %f27(i128) -> i128 {
 block0(v0: i128):
     v1 = clz.i128 v0
     return v1
@@ -1056,7 +853,7 @@ block0(v0: i128):
 }


-function %f33(i128) -> i128 {
+function %f28(i128) -> i128 {
 block0(v0: i128):
     v1 = ctz.i128 v0
     return v1
@@ -1080,3 +877,18 @@ block0(v0: i128):
 ; nextln: movq %rbp, %rsp
 ; nextln: popq %rbp
 ; nextln: ret
+
+function %f29(i8, i128) -> i8 {
+block0(v0: i8, v1: i128):
+    v2 = ishl v0, v1
+    return v2
+}
+
+; check: pushq %rbp
+; nextln: movq %rsp, %rbp
+; nextln: movq %rsi, %rcx
+; nextln: shll %cl, %edi
+; nextln: movq %rdi, %rax
+; nextln: movq %rbp, %rsp
+; nextln: popq %rbp
+; nextln: ret
diff --git a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif
index 37bc4667e7..991b2303cf 100644
--- a/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif
+++ b/cranelift/filetests/filetests/isa/x64/shift-i128-run.clif
@@ -2,105 +2,66 @@ test run
 target x86_64
 feature "experimental_x64"

-function %ishl1() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconcat v0, v0
-    v2 = iconst.i32 2
-    v3 = ishl.i128 v1, v2
-    v4 = iconst.i64 0x04040404_04040404
-    v5 = iconcat v4, v4
-    v6 = icmp eq v3, v5
-    return v6
+function %ishl(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = ishl.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 2) == [0x04040404_04040404, 0x04040404_04040404]
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020200, 0x02020202_02020202]
+; run: %ishl(0x01010101_01010101, 0xffffffff_ffffffff, 66) == [0x00000000_00000000, 0x04040404_04040404]
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %ishl(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]

-function %ishl2() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x01010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 9
-    v4 = ishl.i128 v2, v3
-    v5 = iconst.i64 0x02020202_02020200
-    v6 = iconst.i64 0x02020202_02020202
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %ushr(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = ushr.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 2) == [0x40404040_40404040, 0x00404040_40404040]
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 66) == [0x00404040_40404040, 0x00000000_00000000]
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %ushr(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]

-function %ishl3() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0xffffffff_ffffffff
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 66
-    v4 = ishl.i128 v2, v3
-    v5 = iconst.i64 0x00000000_00000000
-    v6 = iconst.i64 0x04040404_04040404
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %sshr(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = sshr.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %sshr(0x01010101_01010101, 0x81010101_01010101, 2) == [0x40404040_40404040, 0xe0404040_40404040]
+; run: %sshr(0x12345678_9abcdef0, 0x80101010_10101010, 66) == [0xe0040404_04040404, 0xffffffff_ffffffff]
+; run: %sshr(0x12345678_9abcdef0, 0x80101010_10101010, 0) == [0x12345678_9abcdef0, 0x80101010_10101010]
+; run: %sshr(0x12345678_9abcdef0, 0x80101010_10101010, 128) == [0x12345678_9abcdef0, 0x80101010_10101010]

-function %ushr1() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x01010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 2
-    v4 = ushr.i128 v2, v3
-    v5 = iconst.i64 0x40404040_40404040
-    v6 = iconst.i64 0x00404040_40404040
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %rotl(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = rotl.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020202, 0x02020202_02020202]
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 73) == [0x02020202_02020202, 0x02020202_02020202]
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]

-function %ushr2() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x01010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 66
-    v4 = ushr.i128 v2, v3
-    v5 = iconst.i64 0x00404040_40404040
-    v6 = iconst.i64 0x00000000_00000000
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
+function %rotr(i64, i64, i8) -> i64, i64 {
+block0(v0: i64, v1: i64, v2: i8):
+    v3 = iconcat v0, v1
+    v4 = rotr.i128 v3, v2
+    v5, v6 = isplit v4
+    return v5, v6
 }
-; run
-
-function %sshr1() -> b1 {
-block0:
-    v0 = iconst.i64 0x01010101_01010101
-    v1 = iconst.i64 0x81010101_01010101
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 2
-    v4 = sshr.i128 v2, v3
-    v5 = iconst.i64 0x40404040_40404040
-    v6 = iconst.i64 0xe0404040_40404040
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
-}
-; run
-
-function %sshr2() -> b1 {
-block0:
-    v0 = iconst.i64 0x12345678_9abcdef0
-    v1 = iconst.i64 0x80101010_10101010
-    v2 = iconcat v0, v1
-    v3 = iconst.i32 66
-    v4 = sshr.i128 v2, v3
-    v5 = iconst.i64 0xe0040404_04040404
-    v6 = iconst.i64 0xffffffff_ffffffff
-    v7 = iconcat v5, v6
-    v8 = icmp eq v4, v7
-    return v8
-}
-; run
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 9) == [0x80808080_80808080, 0x80808080_80808080]
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 73) == [0x80808080_80808080, 0x80808080_80808080]
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
+; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]
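+
+; Note: the 0 and 128 vectors above exercise the amount == 0 (mod 128) edge
+; case, where the lowering's 64 - amt sub-shift is an x86 no-op.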